library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.0.0 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.6
## v tidyr 0.8.1 v stringr 1.3.1
## v readr 1.1.1 v forcats 0.3.0
## -- Conflicts ---------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
DataCamp offers interactive courses related to R programming. While some of the material is review, it is helpful to see other perspectives on it. As well, DataCamp has some interesting materials on packages that I want to learn better (ggplot2, dplyr, ggvis, etc.). This document summarizes a few key insights from:
This document is currently split between _v003 and _v003_a and _v003_b and _v003_c due to the need to keep the number of DLL that it opens below the hard-coded maximum. This introductory section needs to be re-written, and the contents consolidated, at a future date.
The original DataCamp_Insights_v001 and DataCamp_Insights_v002 documents have been split for this document:
Chapter 1 - Overview and Introduction
What is a hierarchical model?
Parts of a regression:
Random effects in regression:
School data:
Example code includes:
# Load the student-level classroom data and recode the numeric
# sex/minority indicators as labelled factors.
rawStudent <- read.csv("./RInputFiles/classroom.csv")
studentData <- rawStudent %>%
mutate(sex=factor(sex, labels=c("male", "female")), minority=factor(minority, labels=c("no", "yes")))
# Plot the data: household poverty vs. math-score gain, with an OLS trend line
ggplot(data = studentData, aes(x = housepov, y = mathgain)) +
geom_point() +
geom_smooth(method = 'lm')
# Fit a linear model of math gain on household poverty (student level)
summary( lm(mathgain ~ housepov , data = studentData))
##
## Call:
## lm(formula = mathgain ~ housepov, data = studentData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -168.226 -22.222 -1.306 19.763 195.156
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56.937 1.674 34.02 <2e-16 ***
## housepov 3.531 7.515 0.47 0.639
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34.63 on 1188 degrees of freedom
## Multiple R-squared: 0.0001858, Adjusted R-squared: -0.0006558
## F-statistic: 0.2208 on 1 and 1188 DF, p-value: 0.6385
# I have aggregated the data for you into two new datasets at the classroom- and school-levels (As a side note, if you want to learn how to aggregate data, the dplyr or data.table courses teach these skills)
# We will also compare the model outputs across all three levels of aggregation
# Note: how we aggregate the data is important
# I aggregated the data by taking the mean across the student data (in pseudo-code: mean(mathgain) by school or mean(mathgain) by classroom),
# but another reasonable method for aggregating the data would be to aggregate by classroom first and school second
# Aggregate to the classroom level: the mean of each variable within each
# (schoolid, classid) pair; na.rm=TRUE ignores missing student values.
classData <- studentData %>%
group_by(schoolid, classid) %>%
summarize_at(vars(mathgain, mathprep, housepov, yearstea), mean, na.rm=TRUE)
str(classData)
## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame': 312 obs. of 6 variables:
## $ schoolid: int 1 1 2 2 2 3 3 3 3 4 ...
## $ classid : int 160 217 197 211 307 11 137 145 228 48 ...
## $ mathgain: num 65.7 57.4 49.5 69 68.8 ...
## $ mathprep: num 2 3.25 2.5 2.33 2.3 3.83 2.25 3 2.17 2 ...
## $ housepov: num 0.082 0.082 0.082 0.082 0.082 0.086 0.086 0.086 0.086 0.365 ...
## $ yearstea: num 1 2 1 2 12.5 ...
## - attr(*, "vars")= chr "schoolid"
## - attr(*, "drop")= logi TRUE
# Aggregate to the school level: the mean of each variable per schoolid.
schoolData <- studentData %>%
group_by(schoolid) %>%
summarize_at(vars(mathgain, mathprep, housepov, yearstea), mean, na.rm=TRUE)
str(schoolData)
## Classes 'tbl_df', 'tbl' and 'data.frame': 107 obs. of 5 variables:
## $ schoolid: int 1 2 3 4 5 6 7 8 9 10 ...
## $ mathgain: num 59.6 65 88.9 35.2 60.2 ...
## $ mathprep: num 2.91 2.35 2.95 2 3.75 ...
## $ housepov: num 0.082 0.082 0.086 0.365 0.511 0.044 0.148 0.085 0.537 0.346 ...
## $ yearstea: num 1.73 6.02 14.93 22 3 ...
# First, plot housepov and mathgain at the classroom level from the classData data.frame
ggplot(data = classData, aes(x = housepov, y = mathgain)) +
geom_point() +
geom_smooth(method = 'lm')
# Second, plot housepov and mathgain at the school level from the schoolData data.frame
ggplot(data = schoolData, aes(x = housepov, y = mathgain)) +
geom_point() +
geom_smooth(method = 'lm')
# Third, compare your linear regression results from the previous exercise to the two new models
summary( lm(mathgain ~ housepov, data = studentData)) ## student-level data
##
## Call:
## lm(formula = mathgain ~ housepov, data = studentData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -168.226 -22.222 -1.306 19.763 195.156
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56.937 1.674 34.02 <2e-16 ***
## housepov 3.531 7.515 0.47 0.639
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34.63 on 1188 degrees of freedom
## Multiple R-squared: 0.0001858, Adjusted R-squared: -0.0006558
## F-statistic: 0.2208 on 1 and 1188 DF, p-value: 0.6385
summary( lm(mathgain ~ housepov, data = classData)) ## class-level data
##
## Call:
## lm(formula = mathgain ~ housepov, data = classData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -80.479 -14.444 -1.447 13.151 156.516
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58.160 2.542 22.879 <2e-16 ***
## housepov -3.236 10.835 -0.299 0.765
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26.14 on 310 degrees of freedom
## Multiple R-squared: 0.0002876, Adjusted R-squared: -0.002937
## F-statistic: 0.08918 on 1 and 310 DF, p-value: 0.7654
summary( lm(mathgain ~ housepov, data = schoolData)) ## school-level data
##
## Call:
## lm(formula = mathgain ~ housepov, data = schoolData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -46.660 -9.947 -2.494 9.546 41.445
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 59.338 2.624 22.616 <2e-16 ***
## housepov -11.948 10.987 -1.087 0.279
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.8 on 105 degrees of freedom
## Multiple R-squared: 0.01114, Adjusted R-squared: 0.00172
## F-statistic: 1.183 on 1 and 105 DF, p-value: 0.2793
# Plot the means of your data, predictor is your x-variable, response is your y-variable, and intDemo is your data.frame
# Simulated data: three groups ("a"/"b"/"c") of five observations each,
# with clearly separated group means (roughly 0, 4, and 9).
intDemo <- data.frame(predictor=factor(c(rep("a", 5), rep("b", 5), rep("c", 5))),
response=c(-1.207, 0.277, 1.084, -2.346, 0.429, 5.759, 4.138, 4.18, 4.153, 3.665, 9.046, 8.003, 8.447, 10.129, 11.919)
)
str(intDemo)
## 'data.frame': 15 obs. of 2 variables:
## $ predictor: Factor w/ 3 levels "a","b","c": 1 1 1 1 1 2 2 2 2 2 ...
## $ response : num -1.207 0.277 1.084 -2.346 0.429 ...
# Scatterplot of the raw data with each group's mean highlighted in red.
# NOTE(review): stat_summary(fun.y=) matches this transcript's ggplot2 3.0.0;
# ggplot2 >= 3.3 renames the argument to `fun` — confirm before re-running.
ggIntDemo <- ggplot(intDemo, aes(x = predictor, y = response) ) +
geom_point() +
theme_minimal() + stat_summary(fun.y = "mean", color = "red",
size = 3, geom = "point") +
xlab("Intercept groups")
print(ggIntDemo)
# Fit a linear model to your data where response is "predicted by"(~) predictor.
# The "- 1" removes the global intercept so each group gets its own mean estimate.
intModel <- lm( response ~ predictor - 1 , data = intDemo)
summary(intModel)
##
## Call:
## lm(formula = response ~ predictor - 1, data = intDemo)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9934 -0.7842 -0.2260 0.7056 2.4102
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## predictora -0.3526 0.5794 -0.609 0.554
## predictorb 4.3790 0.5794 7.558 6.69e-06 ***
## predictorc 9.5088 0.5794 16.412 1.38e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.296 on 12 degrees of freedom
## Multiple R-squared: 0.9646, Adjusted R-squared: 0.9557
## F-statistic: 109 on 3 and 12 DF, p-value: 5.696e-09
# Overlay the coefficient estimates from a fitted lm() onto the ggIntDemo
# scatterplot (a ggplot object built earlier in this script).
#
# intModel: an lm fit whose coefficient names begin with "predictor".
# Side effect: prints the combined plot.
extractAndPlotResults <- function(intModel){
  # Tidy the model into a term/estimate data.frame
  coefTable <- broom::tidy(intModel)
  # Strip the "predictor" prefix so the terms line up with the x-axis groups
  coefTable$term <- factor(gsub("predictor", "", coefTable$term))
  # Blue, semi-transparent points mark the model's estimated group means
  estimateLayer <- geom_point(data = coefTable,
                              aes(x = term, y = estimate),
                              position = position_dodge(width = 0.4),
                              color = 'blue', size = 8, alpha = 0.25)
  print(ggIntDemo + estimateLayer)
}
# Run the following code, which extracts the model's coefficients and plots them
extractAndPlotResults(intModel)
# Simulated data for the multiple-intercept examples: three groups,
# x running 0..4 within each group, a group-specific intercept column,
# and a response with a shared upward trend in x.
multIntDemo <- data.frame(group=factor(c(rep("a", 5), rep("b", 5), rep("c", 5))),
x=rep(0:4, times=3),
intercept=c(4.11, -1.69, 1.09, 1.9, 1.21, 4.63, 10.29, 4.67, 12.06, 4.78, 15.22, 19.15, 4.44, 8.88, 9.47),
response=c(4.11, 2.31, 9.09, 13.9, 17.21, 4.63, 14.29, 12.67, 24.06, 20.78, 15.22, 23.15, 12.44, 20.88, 25.47)
)
str(multIntDemo)
## 'data.frame': 15 obs. of 4 variables:
## $ group : Factor w/ 3 levels "a","b","c": 1 1 1 1 1 2 2 2 2 2 ...
## $ x : int 0 1 2 3 4 0 1 2 3 4 ...
## $ intercept: num 4.11 -1.69 1.09 1.9 1.21 ...
## $ response : num 4.11 2.31 9.09 13.9 17.21 ...
# Plot the global multIntDemo data with a single pooled OLS trend line
# fitted across all groups (points still colored by group).
#
# out1: accepted for interface consistency with plot_output2()/plot_output3(),
#       but not used — the trend line is refit by stat_smooth().
plot_output1 <- function(out1){
  pooledFitPlot <- ggplot( multIntDemo, aes(x = x, y = response) )
  pooledFitPlot <- pooledFitPlot +
    geom_point(aes(color = group)) +
    theme_minimal() +
    scale_color_manual(values = c("blue", "black", "red")) +
    stat_smooth(method = 'lm', fill = NA, color = 'orange', size = 3)
  print(pooledFitPlot)
}
# Plot multIntDemo with one fitted line per group, built from the
# coefficients of out2 (a same-slope / different-intercept fit of
# response ~ x + group - 1).
# NOTE(review): the positional indexing below assumes broom::tidy()
# returns the shared slope ("x") as row 1 and the per-group intercepts
# in the remaining rows, matching that formula's term order — this will
# break silently if the model formula changes.
plot_output2 <- function(out2){
out2Tidy <- broom::tidy(out2)
# Strip the "group" prefix so terms match the factor levels in the data
out2Tidy$term <- gsub("group", "", out2Tidy$term)
# Row 1 holds the common slope; the remaining rows hold group intercepts
out2Plot <- data.frame(group = pull(out2Tidy[ -1, 1]),
slope = pull(out2Tidy[ 1, 2]),
intercept = pull(out2Tidy[ -1, 2])
)
# Draw one abline per group using the extracted slope/intercepts
ggmultIntgDemo2 <- ggplot( multIntDemo, aes(x = x, y = response) ) +
geom_point(aes(color = group))+
theme_minimal() +
scale_color_manual(values = c("blue", "black", "red")) +
geom_abline(data = out2Plot,
aes(intercept = intercept, slope = slope, color = group))
print(ggmultIntgDemo2)
}
# Plot the global multIntDemo data with a separate OLS line per group
# (the interaction model's visual counterpart).
#
# out3: accepted for interface consistency with the other plot_output_*
#       helpers, but not used — per-group lines are refit by stat_smooth().
plot_output3 <- function(out3){
  perGroupPlot <- ggplot( multIntDemo, aes(x = x, y = response) ) +
    geom_point(aes(color = group)) +
    theme_minimal() +
    scale_color_manual(values = c("blue", "black", "red")) +
    stat_smooth(method = 'lm', aes(color = group), fill = NA)
  print(perGroupPlot)
}
# First, run a model without considering different intercept for each group
out1 <- lm( response ~ x, data=multIntDemo )
summary(out1)
##
## Call:
## lm(formula = response ~ x, data = multIntDemo)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.101 -4.021 -2.011 3.590 11.739
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.141 2.615 3.113 0.00824 **
## x 3.270 1.068 3.062 0.00908 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.848 on 13 degrees of freedom
## Multiple R-squared: 0.4191, Adjusted R-squared: 0.3744
## F-statistic: 9.378 on 1 and 13 DF, p-value: 0.009081
plot_output1(out1)
# Considering same slope but different intercepts
out2 <- lm( response ~ x + group - 1, data=multIntDemo )
summary(out2)
##
## Call:
## lm(formula = response ~ x + group - 1, data = multIntDemo)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.992 -2.219 -0.234 1.810 6.988
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## x 3.2697 0.7516 4.350 0.001155 **
## groupa 2.7847 2.3767 1.172 0.266085
## groupb 8.7467 2.3767 3.680 0.003625 **
## groupc 12.8927 2.3767 5.425 0.000209 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.117 on 11 degrees of freedom
## Multiple R-squared: 0.9534, Adjusted R-squared: 0.9364
## F-statistic: 56.23 on 4 and 11 DF, p-value: 2.97e-07
plot_output2(out2)
# Considering a different slope and intercept for each group (i.e., an interaction)
out3 <- lm( response ~ x + group - 1 + x:group, multIntDemo)
summary(out3)
##
## Call:
## lm(formula = response ~ x + group - 1 + x:group, data = multIntDemo)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.992 -2.429 -0.234 2.368 5.541
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## x 3.779 1.308 2.888 0.017941 *
## groupa 1.766 3.205 0.551 0.595053
## groupb 6.872 3.205 2.144 0.060621 .
## groupc 15.786 3.205 4.925 0.000819 ***
## x:groupb 0.428 1.851 0.231 0.822263
## x:groupc -1.956 1.851 -1.057 0.318050
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.138 on 9 degrees of freedom
## Multiple R-squared: 0.9615, Adjusted R-squared: 0.9358
## F-statistic: 37.42 on 6 and 9 DF, p-value: 7.263e-06
plot_output3(out3)
# Replace the intercept/response columns with new simulated values for the
# random-intercept demonstration.
multIntDemo$intercept <- c(-0.87, 3.35, 1.25, 0.88, -1.05, 4.55, 1.22, 3.34, 1.26, 3.75, 7.71, 9.59, 2.28, 1.9, 13.35)
multIntDemo$response <- c(-0.87, 6.35, 7.25, 9.88, 10.95, 4.55, 4.22, 9.34, 10.26, 15.75, 7.71, 12.59, 8.28, 10.9, 25.35)
# Run model: fixed-effect slope for x, random intercept (1 | group)
outLmer <- lme4::lmer( response ~ x + ( 1 | group), multIntDemo)
# Look at model outputs
summary( outLmer )
## Linear mixed model fit by REML ['lmerMod']
## Formula: response ~ x + (1 | group)
## Data: multIntDemo
##
## REML criterion at convergence: 76.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.31582 -0.61105 -0.01593 0.45125 2.19118
##
## Random effects:
## Groups Name Variance Std.Dev.
## group (Intercept) 7.98 2.825
## Residual 10.71 3.272
## Number of obs: 15, groups: group, 3
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 3.5540 2.1913 1.622
## x 2.9733 0.5975 4.977
##
## Correlation of Fixed Effects:
## (Intr)
## x -0.545
broom::tidy( outLmer )
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## # A tibble: 4 x 5
## term estimate std.error statistic group
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 (Intercept) 3.55 2.19 1.62 fixed
## 2 x 2.97 0.597 4.98 fixed
## 3 sd_(Intercept).group 2.82 NA NA group
## 4 sd_Observation.Residual 3.27 NA NA Residual
# Plot lmer predictions (dashed lines) on top of reference lines built
# from the data's `intercept` column and a fixed slope.
#
# outLmer: a fitted merMod object; its predict() values are drawn per group.
# slope:   slope of the solid reference lines (default 3, close to the
#          fitted fixed slope of ~2.97 shown in the summary above).
# NOTE(review): inside aes(), `slope` is not a column of multIntDemo, so it
# resolves to this function's argument — intentional here, but subtle.
# Only a local copy of multIntDemo is modified; the global is untouched.
extractAndPlotOutput <- function(outLmer, slope=3){
multIntDemo$lmerPredict <- predict(outLmer)
ggmultIntgDemo2 <- ggplot( multIntDemo, aes(x = x, y = response) ) +
geom_point(aes(color = group))+
theme_minimal() +
scale_color_manual(values = c("blue", "black", "red")) +
geom_abline(data = multIntDemo,
aes(intercept = intercept, slope = slope, color = group))
outPlot <- ggmultIntgDemo2 +
geom_line( data = multIntDemo,
aes(x = x, y = lmerPredict, color = group),
linetype = 2)
print(outPlot)
}
# Extract predictor variables and plot
extractAndPlotOutput(outLmer)
# Random effect slopes
# New simulated values where both the intercept and the slope differ by group.
multIntDemo$response <- c(-0.72, 1.5, 4.81, 6.61, 13.62, 10.21, 9.64, 11.91, 16.39, 16.97, 8.76, 14.79, 15.83, 15.27, 17.36)
multIntDemo$intercept <- c(-0.72, -1.5, -1.19, -2.39, 1.62, 10.21, 6.64, 5.91, 7.39, 4.97, 8.76, 11.79, 9.83, 6.27, 5.36)
# (x | group) gives each group a random intercept and a random slope for x
# (correlated); note no fixed effect for x is included in this formula.
outLmer2 <- lme4::lmer( response ~ ( x|group ), multIntDemo)
summary(outLmer2)
## Linear mixed model fit by REML ['lmerMod']
## Formula: response ~ (x | group)
## Data: multIntDemo
##
## REML criterion at convergence: 69.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.56747 -0.54105 -0.06286 0.75141 1.27947
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## group (Intercept) 273.998 16.553
## x 6.096 2.469 -1.00
## Residual 2.466 1.570
## Number of obs: 15, groups: group, 3
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 21.676 1.383 15.67
broom::tidy(outLmer2)
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## # A tibble: 5 x 5
## term estimate std.error statistic group
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 (Intercept) 21.7 1.38 15.7 fixed
## 2 sd_(Intercept).group 16.6 NA NA group
## 3 sd_x.group 2.47 NA NA group
## 4 cor_(Intercept).x.group -1 NA NA group
## 5 sd_Observation.Residual 1.57 NA NA Residual
# Compare per-group OLS fits (solid lines, refit by stat_smooth) against
# the mixed-model predictions from outLmer2 (dashed lines) for the global
# multIntDemo data. Only a local copy of multIntDemo is modified.
plotOutput <- function(outLmer2){
  # Fitted values from the mixed model, one per observation
  multIntDemo$lmerPredict2 <- predict(outLmer2)
  comparisonPlot <- ggplot( multIntDemo, aes(x = x, y = response) ) +
    geom_point(aes(color = group)) +
    theme_minimal() +
    scale_color_manual(values = c("blue", "black", "red")) +
    stat_smooth(method = 'lm', aes(color = group), fill = NA) +
    geom_line( data = multIntDemo,
               aes(x = x, y = lmerPredict2, color = group),
               linetype = 2)
  print(comparisonPlot)
}
# Extract and plot
plotOutput(outLmer2)
# Mixed effect model
# mathgain modeled with sex, mathprep, and mathknow as fixed effects and
# random intercepts for both classroom and school; rows with missing
# values are dropped (na.action = "na.omit"), fit by REML.
lmerModel <- lme4::lmer(mathgain ~ sex +
mathprep + mathknow + (1|classid) +
(1|schoolid), data = studentData, na.action = "na.omit",
REML = TRUE)
summary(lmerModel)
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## mathgain ~ sex + mathprep + mathknow + (1 | classid) + (1 | schoolid)
## Data: studentData
##
## REML criterion at convergence: 10677.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.3203 -0.6146 -0.0294 0.5467 5.5331
##
## Random effects:
## Groups Name Variance Std.Dev.
## classid (Intercept) 103.57 10.177
## schoolid (Intercept) 85.44 9.244
## Residual 1019.47 31.929
## Number of obs: 1081, groups: classid, 285; schoolid, 105
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 52.250 3.838 13.613
## sexfemale -1.526 2.041 -0.747
## mathprep 2.426 1.298 1.869
## mathknow 2.405 1.299 1.851
##
## Correlation of Fixed Effects:
## (Intr) sexfml mthprp
## sexfemale -0.268
## mathprep -0.878 0.001
## mathknow -0.003 0.011 0.005
# Coefficient ("forest") plot of a fitted lmer model's fixed effects,
# excluding the intercept. The red line at zero marks "no effect", so a
# confidence interval crossing it suggests a non-significant term.
#
# lmerModel: a fitted merMod object supported by broom::tidy(conf.int = TRUE).
# Side effect: prints the plot.
extractAndPlot <- function(lmerModel){
  tidyFit <- broom::tidy(lmerModel, conf.int = TRUE)
  # Keep only fixed-effect rows, dropping the intercept term
  keep <- tidyFit$group =="fixed" & tidyFit$term != "(Intercept)"
  fixedTerms <- tidyFit[ keep, ]
  coefPlot <- ggplot(fixedTerms, aes(x = term, y = estimate,
                                     ymin = conf.low,
                                     ymax = conf.high)) +
    theme_minimal() +
    geom_hline(yintercept = 0.0, color = 'red', size = 2.0) +
    geom_point() +
    geom_linerange() +
    coord_flip()
  print(coefPlot)
}
# Extract and plot
extractAndPlot(lmerModel)
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
Chapter 2 - Linear Mixed-Effect Models
Linear mixed effect model - Birth rates data:
Understanding and reporting the outputs of lmer:
Statistical inference with Maryland crime data:
Example code includes:
# Read in births data
rawBirths <- read.csv("./RInputFiles/countyBirthsDataUse.csv")
countyBirthsData <- rawBirths
str(countyBirthsData)
## 'data.frame': 580 obs. of 8 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Year : int 2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
## $ TotalPopulation : int 203709 115620 103057 104173 660367 156993 353089 415395 226519 119565 ...
## $ BirthRate : num 11.5 12.1 11.8 12.4 13.3 ...
## $ AverageBirthWeight: num 3261 3209 3239 3207 3177 ...
## $ AverageAgeofMother: num 27.5 26.3 25.8 26.9 27.9 ...
## $ CountyName : Factor w/ 472 levels "Ada","Adams",..: 22 64 141 189 200 229 248 273 278 279 ...
## $ State : Factor w/ 50 levels "AK","AL","AR",..: 2 2 2 2 2 2 2 2 2 2 ...
# First, build a lmer with state as a random effect. Then look at the model's summary and the plot of residuals.
birthRateStateModel <- lme4::lmer(BirthRate ~ (1|State), data=countyBirthsData)
summary(birthRateStateModel)
## Linear mixed model fit by REML ['lmerMod']
## Formula: BirthRate ~ (1 | State)
## Data: countyBirthsData
##
## REML criterion at convergence: 2411
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.7957 -0.6056 -0.1063 0.5211 5.5948
##
## Random effects:
## Groups Name Variance Std.Dev.
## State (Intercept) 1.899 1.378
## Residual 3.256 1.804
## Number of obs: 578, groups: State, 50
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 12.3362 0.2216 55.67
plot(birthRateStateModel)
# Next, plot the predicted values from the model on top of the plot shown during the video.
countyBirthsData$birthPredictState <- predict(birthRateStateModel, countyBirthsData)
ggplot() + theme_minimal() +
geom_point(data =countyBirthsData, aes(x = TotalPopulation, y = BirthRate)) +
geom_point(data = countyBirthsData, aes(x = TotalPopulation, y = birthPredictState),
color = 'blue', alpha = 0.5
)
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).
# Include the AverageAgeofMother as a fixed effect within the lmer and state as a random effect
ageMotherModel <- lme4::lmer( BirthRate ~ AverageAgeofMother + (1|State), data=countyBirthsData)
summary(ageMotherModel)
## Linear mixed model fit by REML ['lmerMod']
## Formula: BirthRate ~ AverageAgeofMother + (1 | State)
## Data: countyBirthsData
##
## REML criterion at convergence: 2347.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.9602 -0.6086 -0.1042 0.5144 5.2686
##
## Random effects:
## Groups Name Variance Std.Dev.
## State (Intercept) 1.562 1.250
## Residual 2.920 1.709
## Number of obs: 578, groups: State, 50
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 27.57033 1.81801 15.165
## AverageAgeofMother -0.53549 0.06349 -8.434
##
## Correlation of Fixed Effects:
## (Intr)
## AvrgAgfMthr -0.994
# Compare the random-effect model to the linear effect model
summary(lm(BirthRate ~ AverageAgeofMother, data = countyBirthsData))
##
## Call:
## lm(formula = BirthRate ~ AverageAgeofMother, data = countyBirthsData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.8304 -1.3126 -0.1795 1.2198 8.7327
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.06637 1.83374 15.851 <2e-16 ***
## AverageAgeofMother -0.59380 0.06441 -9.219 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.065 on 576 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.1286, Adjusted R-squared: 0.1271
## F-statistic: 84.99 on 1 and 576 DF, p-value: < 2.2e-16
# Include the AverageAgeofMother as a correlated random-effect slope parameter
ageMotherModelRandomCorrelated <- lme4::lmer(BirthRate ~ AverageAgeofMother + (AverageAgeofMother|State),
countyBirthsData)
summary(ageMotherModelRandomCorrelated)
## Linear mixed model fit by REML ['lmerMod']
## Formula: BirthRate ~ AverageAgeofMother + (AverageAgeofMother | State)
## Data: countyBirthsData
##
## REML criterion at convergence: 2337.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.8399 -0.5966 -0.1133 0.5228 5.1815
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## State (Intercept) 78.75816 8.8746
## AverageAgeofMother 0.08482 0.2912 -0.99
## Residual 2.80306 1.6742
## Number of obs: 578, groups: State, 50
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 27.22042 2.41281 11.282
## AverageAgeofMother -0.52347 0.08302 -6.306
##
## Correlation of Fixed Effects:
## (Intr)
## AvrgAgfMthr -0.997
# Include the AverageAgeofMother as an uncorrelated random-effect slope parameter (note the double-bar || syntax)
ageMotherModelRandomUncorrelated <- lme4::lmer(BirthRate ~ AverageAgeofMother +
(AverageAgeofMother || State), data=countyBirthsData
)
summary(ageMotherModelRandomUncorrelated)
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## BirthRate ~ AverageAgeofMother + ((1 | State) + (0 + AverageAgeofMother |
## State))
## Data: countyBirthsData
##
## REML criterion at convergence: 2347.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.9602 -0.6086 -0.1042 0.5144 5.2686
##
## Random effects:
## Groups Name Variance Std.Dev.
## State (Intercept) 1.562 1.250
## State.1 AverageAgeofMother 0.000 0.000
## Residual 2.920 1.709
## Number of obs: 578, groups: State, 50
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 27.57033 1.81801 15.165
## AverageAgeofMother -0.53549 0.06349 -8.434
##
## Correlation of Fixed Effects:
## (Intr)
## AvrgAgfMthr -0.994
out <- ageMotherModelRandomUncorrelated
# Extract the fixed-effect coefficients
lme4::fixef(out)
## (Intercept) AverageAgeofMother
## 27.5703303 -0.5354886
# Extract the random-effect coefficients
lme4::ranef(out)
## $State
## (Intercept) AverageAgeofMother
## AK 1.03549149 0
## AL -0.52500819 0
## AR 0.48023356 0
## AZ -1.04094123 0
## CA 0.50530542 0
## CO 0.09585582 0
## CT -1.91638101 0
## DC 0.96029532 0
## DE -0.38938118 0
## FL -1.87440508 0
## GA 0.39776424 0
## HI 0.08513474 0
## IA 0.96279528 0
## ID 1.17377458 0
## IL -0.12739337 0
## IN -0.32655206 0
## KS 0.85650338 0
## KY 0.64871300 0
## LA 1.04437463 0
## MA -1.40082047 0
## MD 0.10842918 0
## ME -1.63520397 0
## MI -1.13797832 0
## MN 0.93266233 0
## MO 0.07081901 0
## MS -0.21397453 0
## MT -0.13190265 0
## NC -0.28681241 0
## ND 0.99847758 0
## NE 1.49390428 0
## NH -1.45440958 0
## NJ -0.30089452 0
## NM -0.69753301 0
## NV 0.09012925 0
## NY -0.58163335 0
## OH -1.07390325 0
## OK 0.77997159 0
## OR -0.75845586 0
## PA -1.59332743 0
## RI -1.36395356 0
## SC -0.59295090 0
## SD 1.35141914 0
## TN -0.13512968 0
## TX 1.70872465 0
## UT 3.66056804 0
## VA 1.59187553 0
## VT -0.51105276 0
## WA 0.23008359 0
## WI -0.51646717 0
## WV -0.67684007 0
# Estimate the confidence intervals
(ciOut <- confint(out))
## Computing profile confidence intervals ...
## 2.5 % 97.5 %
## .sig01 0.0000000 1.61243911
## .sig02 0.0000000 0.05033967
## .sigma 1.6091449 1.81592859
## (Intercept) 24.0121848 31.14668434
## AverageAgeofMother -0.6605319 -0.41123099
# Technical note: Extracting out the regression coefficients from lmer is tricky (see discussion between the lmer and broom authors development)
# Extract out the parameter estimates and confidence intervals and manipulate the data
dataPlot <- data.frame(cbind( lme4::fixef(out), ciOut[ 4:5, ]))
rownames(dataPlot)[1] <- "Intercept"
colnames(dataPlot) <- c("mean", "l95", "u95")
dataPlot$parameter <- rownames(dataPlot)
# Print the new dataframe
print(dataPlot)
## mean l95 u95 parameter
## Intercept 27.5703303 24.0121848 31.146684 Intercept
## AverageAgeofMother -0.5354886 -0.6605319 -0.411231 AverageAgeofMother
# Plot the results using ggplot2
ggplot(dataPlot, aes(x = parameter, y = mean,
ymin = l95, ymax = u95)) +
geom_hline( yintercept = 0, color = 'red' ) +
geom_linerange() + geom_point() + coord_flip() + theme_minimal()
# Read in crime data
rawCrime <- read.csv("./RInputFiles/MDCrime.csv")
MDCrime <- rawCrime
str(MDCrime)
## 'data.frame': 192 obs. of 5 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ County: Factor w/ 24 levels "ALLEGANY","ANNE ARUNDEL",..: 2 3 4 5 6 7 8 9 10 11 ...
## $ Year : int 2006 2006 2006 2006 2006 2006 2006 2006 2006 2006 ...
## $ Crime : int 3167 10871 5713 257 149 374 490 729 181 752 ...
## $ Year2 : int 0 0 0 0 0 0 0 0 0 0 ...
plot1 <- ggplot(data = MDCrime, aes(x = Year, y = Crime, group = County)) +
geom_line() + theme_minimal() +
ylab("Major crimes reported per county")
print(plot1)
plot1 + geom_smooth(method = 'lm')
# Null hypothesis testing uses p-values to see if a variable is "significant"
# Recently, the abuse and overuse of null hypothesis testing and p-values has caused the American Statistical Association to issue a statement about the use of p-values
# Because of these criticisms and other numerical challenges, Doug Bates (the creator of the lme4 package) does not include p-values as part of his package
# However, you may still want to estimate p-values, because p-values are still commonly used. Several packages exist, including the lmerTest package
# https://www.amstat.org/asa/files/pdfs/P-ValueStatement.pdf
# Load lmerTest
# library(lmerTest)
# Fit the model with Year as both a fixed and random-effect.
# Note: using raw Year (values around 2006) triggers the convergence and
# scaling warnings shown below; the rescaled Year2 variable used next avoids this.
lme4::lmer(Crime ~ Year + (1 + Year | County) , data = MDCrime)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl =
## control$checkConv, : Model failed to converge with max|grad| = 0.338309
## (tol = 0.002, component 1)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
## - Rescale variables?;Model is nearly unidentifiable: large eigenvalue ratio
## - Rescale variables?
## Linear mixed model fit by REML ['lmerMod']
## Formula: Crime ~ Year + (1 + Year | County)
## Data: MDCrime
## REML criterion at convergence: 2891.03
## Random effects:
## Groups Name Std.Dev. Corr
## County (Intercept) 655.2118
## Year 0.8322 1.00
## Residual 328.2865
## Number of obs: 192, groups: County, 24
## Fixed Effects:
## (Intercept) Year
## 136642.97 -67.33
## convergence code 0; 3 optimizer warnings; 0 lme4 warnings
# Fit the model with Year2 rather than Year
out <- lme4::lmer(Crime ~ Year2 + (1 + Year2 | County) , data = MDCrime)
# Examine the model's output
summary(out)
## Linear mixed model fit by REML ['lmerMod']
## Formula: Crime ~ Year2 + (1 + Year2 | County)
## Data: MDCrime
##
## REML criterion at convergence: 2535.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.8081 -0.2235 -0.0390 0.2837 3.0768
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## County (Intercept) 7587959 2754.63
## Year2 16945 130.17 -0.91
## Residual 8425 91.79
## Number of obs: 192, groups: County, 24
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 1577.28 562.42 2.804
## Year2 -67.33 26.73 -2.519
##
## Correlation of Fixed Effects:
## (Intr)
## Year2 -0.907
## Build the Null model with only County as a random-effect
null_model <- lme4::lmer(Crime ~ (1 | County) , data = MDCrime)
## Build the Year2 model with Year2 as a fixed and random slope and County as the random-effect
year_model <- lme4::lmer(Crime ~ Year2 + (1 + Year2 | County) , data = MDCrime)
## Compare the two models using an anova
## (anova refits both with ML instead of REML, as noted in the output below)
anova(null_model, year_model)
## refitting model(s) with ML (instead of REML)
## Data: MDCrime
## Models:
## null_model: Crime ~ (1 | County)
## year_model: Crime ~ Year2 + (1 + Year2 | County)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## null_model 3 2954.4 2964.2 -1474.2 2948.4
## year_model 6 2568.9 2588.4 -1278.4 2556.9 391.52 3 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Chapter 3 - Generalized Linear Mixed-Effect Models
Crash course on GLMs - relaxing the assumptions around normality of the residuals:
Binomial data - modeling data with only two outcomes:
Count data:
Example code includes:
# In this case study, we will be working with simulated dose-response data
# The response is mortality (1) or survival (0) at the end of a study. During this exercise, we will fit a logistic regression using all three methods described in the video
# You have been given two datasets. dfLong has the data in a "long" format with each row corresponding to an observation (i.e., a 0 or 1)
# dfShort has the data in an aggregated format with each row corresponding to a treatment (e.g., 6 successes, 4 failures, number of replicates = 10, proportion = 0.6)
dfLong <- data.frame(dose=c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10),
mortality=c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1)
)
str(dfLong)
## 'data.frame': 120 obs. of 2 variables:
## $ dose : num 0 0 0 0 0 0 0 0 0 0 ...
## $ mortality: num 0 0 0 0 0 0 0 0 0 0 ...
# Aggregate the long data to one row per dose: total deaths, number of
# replicates, survivors, and the observed mortality proportion.
dfShort <- dfLong %>%
group_by(dose) %>%
summarize(mortality=sum(mortality), nReps=n()) %>%
mutate(survival=nReps-mortality, mortalityP=mortality/nReps)
dfShort
## # A tibble: 6 x 5
## dose mortality nReps survival mortalityP
## <dbl> <dbl> <int> <dbl> <dbl>
## 1 0 0 20 20.0 0
## 2 2.00 4.00 20 16.0 0.200
## 3 4.00 8.00 20 12.0 0.400
## 4 6.00 10.0 20 10.0 0.500
## 5 8.00 11.0 20 9.00 0.550
## 6 10.0 13.0 20 7.00 0.650
# Fit a glm using data in a long format
# (one Bernoulli outcome per row; coefficients match the aggregated fits below)
fitLong <- glm(mortality ~ dose, data = dfLong, family = "binomial")
summary(fitLong)
##
## Call:
## glm(formula = mortality ~ dose, family = "binomial", data = dfLong)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5916 -0.8245 -0.4737 1.0440 1.8524
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.13075 0.44532 -4.785 1.71e-06 ***
## dose 0.30663 0.06821 4.495 6.95e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 159.76 on 119 degrees of freedom
## Residual deviance: 134.71 on 118 degrees of freedom
## AIC: 138.71
##
## Number of Fisher Scoring iterations: 3
# Fit a glm using data in a short format with two columns:
# the response is cbind(successes, failures) per dose level
fitShort <- glm(cbind(mortality, survival) ~ dose,
                data = dfShort, family = "binomial")
summary(fitShort)
##
## Call:
## glm(formula = cbind(mortality, survival) ~ dose, family = "binomial",
## data = dfShort)
##
## Deviance Residuals:
## 1 2 3 4 5 6
## -2.1186 0.2316 1.0698 0.6495 -0.2699 -0.6634
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.13075 0.44537 -4.784 1.72e-06 ***
## dose 0.30663 0.06822 4.495 6.97e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 31.6755 on 5 degrees of freedom
## Residual deviance: 6.6214 on 4 degrees of freedom
## AIC: 27.415
##
## Number of Fisher Scoring iterations: 4
# Fit a glm using data in a short format with weights
# (proportion response plus nReps as weights reproduces fitShort exactly)
fitShortP <- glm( mortalityP ~ dose , data = dfShort, weights = nReps , family = "binomial")
summary(fitShortP)
##
## Call:
## glm(formula = mortalityP ~ dose, family = "binomial", data = dfShort,
## weights = nReps)
##
## Deviance Residuals:
## 1 2 3 4 5 6
## -2.1186 0.2316 1.0698 0.6495 -0.2699 -0.6634
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.13075 0.44537 -4.784 1.72e-06 ***
## dose 0.30663 0.06822 4.495 6.97e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 31.6755 on 5 degrees of freedom
## Residual deviance: 6.6214 on 4 degrees of freedom
## AIC: 27.415
##
## Number of Fisher Scoring iterations: 4
# Small count data: compare an ordinary lm (Gaussian errors) with a Poisson glm
y <- c(0, 1, 0, 1, 0, 1, 0, 1, 0, 2, 1, 2, 0, 1, 1, 0, 1, 5, 1, 1)
x <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20)
# Fit the linear model (x is not significant at the 0.05 level here)
summary(lm(y ~ x))
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3677 -0.6145 -0.2602 0.4297 3.4805
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.15263 0.50312 0.303 0.7651
## x 0.07594 0.04200 1.808 0.0873 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.083 on 18 degrees of freedom
## Multiple R-squared: 0.1537, Adjusted R-squared: 0.1067
## F-statistic: 3.269 on 1 and 18 DF, p-value: 0.08733
# Fit the generalized linear model (Poisson errors; x becomes significant)
summary(glm(y ~ x, family = "poisson"))
##
## Call:
## glm(formula = y ~ x, family = "poisson")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6389 -0.9726 -0.3115 0.5307 2.1559
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.04267 0.60513 -1.723 0.0849 .
## x 0.08360 0.04256 1.964 0.0495 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 23.589 on 19 degrees of freedom
## Residual deviance: 19.462 on 18 degrees of freedom
## AIC: 52.17
##
## Number of Fisher Scoring iterations: 5
# Often, we want to "look" at our data and trends in our data
# ggplot2 allows us to add trend lines to our data
# The default lines are created using a technique called local regression (loess)
# However, we can specify different models, including GLMs
# During this exercise, we'll see how to plot a GLM
# Plot the data using jittered points and the default stat_smooth
ggplot(data = dfLong, aes(x = dose, y = mortality)) +
geom_jitter(height = 0.05, width = 0.1) +
stat_smooth(fill = 'pink', color = 'red')
## `geom_smooth()` using method = 'loess'
# Plot the data using jittered points and the glm stat_smooth
# (method.args passes family = "binomial" through to glm())
ggplot(data = dfLong, aes(x = dose, y = mortality)) +
geom_jitter(height = 0.05, width = 0.1) +
stat_smooth(method = 'glm', method.args = list(family = "binomial"))
# library(lmerTest)
# Three replicates (a, b, c) of the dose-mortality experiment, stacked long
df <- data.frame(dose=rep(rep(c(0, 2, 4, 6, 8, 10), each=20), times=3),
mortality=c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1),
replicate=factor(rep(letters[1:3], each=120))
)
str(df)
## 'data.frame': 360 obs. of 3 variables:
## $ dose : num 0 0 0 0 0 0 0 0 0 0 ...
## $ mortality: num 0 0 0 0 0 0 0 0 0 0 ...
## $ replicate: Factor w/ 3 levels "a","b","c": 1 1 1 1 1 1 1 1 1 1 ...
# Logistic mixed model with a random intercept per replicate
glmerOut <- lme4::glmer(mortality ~ dose + (1|replicate), family = 'binomial', data = df)
summary(glmerOut)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: mortality ~ dose + (1 | replicate)
## Data: df
##
## AIC BIC logLik deviance df.resid
## 378.1 389.8 -186.0 372.1 357
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.3484 -0.6875 -0.3031 0.6413 2.1907
##
## Random effects:
## Groups Name Variance Std.Dev.
## replicate (Intercept) 6.658e-15 8.16e-08
## Number of obs: 360, groups: replicate, 3
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.38736 0.27334 -8.734 <2e-16 ***
## dose 0.40948 0.04414 9.276 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## dose -0.884
# NOTE(review): the replicate variance is effectively zero (6.66e-15), i.e. a
# singular fit - the replicates contribute no detectable between-group variation
# library(lmerTest)
# The following examples are kept commented out because allData is not loaded here
# Fit the model and look at its summary
# modelOut <- lme4::glmer( cbind(Purchases, Pass) ~ friend + ranking + (1|city), data = allData, family = 'binomial')
# summary( modelOut)
# Compare outputs to a lmer model
# summary(lme4::lmer( Purchases/( Purchases + Pass) ~ friend + ranking + (1|city), data = allData))
# Run the code to see how to calculate odds ratios
# summary(modelOut)
# exp(fixef(modelOut)[2])
# exp(confint(modelOut)[3, ])
# Load lmerTest
# library(lmerTest)
# Click counts for 4 user groups (A-D), 10 users each, on old vs new webpages
userGroups <- data.frame(group=factor(rep(rep(LETTERS[1:4], each=10), times=2)),
webpage=factor(rep(c("old", "new"), each=40)),
clicks=c(0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 1, 1, 1, 2, 0, 1, 1, 0, 3, 2, 3, 1, 2, 4, 2, 1, 0, 2, 0, 1, 2, 0, 2, 1, 1, 2, 4, 2, 8, 1, 1, 1, 2, 1, 1, 0, 0, 3, 0, 1, 4, 1, 2, 0, 1, 1, 0, 0, 3, 2, 0, 3, 1, 2, 2, 0, 2, 3, 1, 3, 2, 4, 4, 2, 1, 5, 2)
)
str(userGroups)
## 'data.frame': 80 obs. of 3 variables:
## $ group : Factor w/ 4 levels "A","B","C","D": 1 1 1 1 1 1 1 1 1 1 ...
## $ webpage: Factor w/ 2 levels "new","old": 2 2 2 2 2 2 2 2 2 2 ...
## $ clicks : num 0 0 0 0 0 0 2 0 0 0 ...
# Fit a Poisson glmer: count response, random intercept per user group
# (note "new" is the reference level, so the coefficient shown is for "old")
summary( lme4::glmer(clicks ~ webpage + (1|group), family = 'poisson', data = userGroups))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: poisson ( log )
## Formula: clicks ~ webpage + (1 | group)
## Data: userGroups
##
## AIC BIC logLik deviance df.resid
## 255.5 262.6 -124.7 249.5 77
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.3999 -0.9104 -0.2340 0.4978 5.6126
##
## Random effects:
## Groups Name Variance Std.Dev.
## group (Intercept) 0.07093 0.2663
## Number of obs: 80, groups: group, 4
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.5524 0.1797 3.074 0.00211 **
## webpageold -0.5155 0.1920 -2.685 0.00726 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## webpageold -0.400
# library(lmerTest)
# Illinois count data: counts by age class, year, and county
rawIL <- read.csv("./RInputFiles/ILData.csv")
ILdata <- rawIL
str(ILdata)
## 'data.frame': 1808 obs. of 4 variables:
## $ age : Factor w/ 4 levels "15_19","20_24",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ year : int 0 0 0 0 0 0 0 0 0 0 ...
## $ county: Factor w/ 47 levels "ALEXANDER","BROWN",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ count : int 0 0 0 5 0 7 0 4 0 12 ...
# Age goes before year
# Poisson glmer: fixed age + year effects, random intercept AND slope per county
modelOut <- lme4::glmer(count ~ age + year + (year|county), family = 'poisson', data = ILdata)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control
## $checkConv, : Model failed to converge with max|grad| = 0.00144074 (tol =
## 0.001, component 1)
summary(modelOut)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: poisson ( log )
## Formula: count ~ age + year + (year | county)
## Data: ILdata
##
## AIC BIC logLik deviance df.resid
## 3215.6 3259.6 -1599.8 3199.6 1800
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.4511 -0.0151 -0.0056 -0.0022 4.0053
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## county (Intercept) 129.9459 11.3994
## year 0.0648 0.2546 -1.00
## Number of obs: 1808, groups: county, 47
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.76258 2.13022 -5.052 4.36e-07 ***
## age20_24 -0.04152 0.03690 -1.125 0.261
## age25_29 -1.16262 0.05290 -21.976 < 2e-16 ***
## age30_34 -2.28278 0.08487 -26.898 < 2e-16 ***
## year 0.32708 0.25422 1.287 0.198
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) a20_24 a25_29 a30_34
## age20_24 -0.008
## age25_29 -0.006 0.341
## age30_34 -0.004 0.213 0.148
## year -0.764 0.000 0.000 0.000
## convergence code: 0
## Model failed to converge with max|grad| = 0.00144074 (tol = 0.001, component 1)
# NOTE(review): the intercept/slope correlation of -1.00 plus the convergence
# warning indicate a degenerate random-effects fit; treat estimates with caution
# Extract out fixed effects
lme4::fixef(modelOut)
## (Intercept) age20_24 age25_29 age30_34 year
## -10.76258497 -0.04151848 -1.16262225 -2.28277972 0.32708039
# Extract out random effects (per-county intercept and year-slope deviations)
lme4::ranef(modelOut)
## $county
## (Intercept) year
## ALEXANDER -0.2847724 0.006331741
## BROWN -0.2847724 0.006331741
## CALHOUN -0.2847724 0.006331741
## CARROLL 12.2418514 -0.260423999
## CASS -0.2847724 0.006331741
## CLARK 12.2137668 -0.268553354
## CLAY -0.2847724 0.006331741
## CRAWFORD 12.5037407 -0.265752695
## CUMBERLAND -0.2847724 0.006331741
## DE WITT 12.7456078 -0.277675211
## DOUGLAS 13.0751590 -0.306329903
## EDGAR 12.3642794 -0.283606045
## EDWARDS -0.2847724 0.006331741
## FAYETTE 12.8094530 -0.273060474
## FORD -0.2847724 0.006331741
## GALLATIN -0.2847724 0.006331741
## GREENE -0.2847724 0.006331741
## HAMILTON -0.2847724 0.006331741
## HANCOCK 12.8581265 -0.305650287
## HARDIN -0.2847724 0.006331741
## HENDERSON -0.2847724 0.006331741
## IROQUOIS 13.1616741 -0.311372907
## JASPER -0.2847724 0.006331741
## JERSEY 12.9202747 -0.272284048
## JO DAVIESS 12.7409389 -0.289747791
## JOHNSON -0.2847724 0.006331741
## LAWRENCE 12.3713561 -0.268571236
## MARSHALL -0.2847724 0.006331741
## MASON -0.2847724 0.006331741
## MENARD -0.2180916 0.004849989
## MERCER 12.7534193 -0.271678572
## MOULTRIE -0.2180916 0.004849989
## PIATT 12.5653132 -0.296687752
## PIKE 12.5310614 -0.259211299
## POPE -0.2180916 0.004849989
## PULASKI -0.2180916 0.004849989
## PUTNAM -0.2180916 0.004849989
## RICHLAND 12.0350865 -0.273928951
## SCHUYLER -0.2180916 0.004849989
## SCOTT -0.2180916 0.004849989
## SHELBY 12.5183293 -0.283472292
## STARK -0.2180916 0.004849989
## UNION 13.1465272 -0.308673332
## WABASH -0.2180916 0.004849989
## WASHINGTON -0.2180916 0.004849989
## WAYNE 12.1148896 -0.253234752
## WHITE -0.2180916 0.004849989
# Run code to see one method for plotting the data:
# raw county trajectories (geom_line) faceted by age class, with a Poisson
# glm trend line fit across all counties per facet
ggplot(data = ILdata, aes(x = year, y = count, group = county)) +
geom_line() +
facet_grid(age ~ . ) +
stat_smooth( method = 'glm',
method.args = list( family = "poisson"), se = FALSE,
alpha = 0.5) +
theme_minimal()
Chapter 4 - Repeated Measures
An introduction to repeated measures:
Sleep study:
Hate in NY state?
Wrap up:
Example code includes:
# Before/after measurements on 10 subjects (a-j), interleaved before,after pairs
y <- c(0.23, 2.735, -0.038, 6.327, -0.643, 1.69, -1.378, -1.228, -0.252, 2.014, -0.073, 6.101, 0.213, 3.127, -0.29, 8.395, -0.33, 2.735, 0.223, 1.301)
treat <- rep(c("before", "after"), times=10)
x <- rep(letters[1:10], each=2)
# Run a standard, non-paired t-test (ignores the subject pairing)
t.test(y[treat == "before"], y[treat == "after"], paired = FALSE)
##
## Welch Two Sample t-test
##
## data: y[treat == "before"] and y[treat == "after"]
## t = -3.9043, df = 9.5409, p-value = 0.003215
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -5.594744 -1.512256
## sample estimates:
## mean of x mean of y
## -0.2338 3.3197
# Run a standard, paired t-test
t.test(y[treat == "before"], y[treat == "after"], paired = TRUE)
##
## Paired t-test
##
## data: y[treat == "before"] and y[treat == "after"]
## t = -4.2235, df = 9, p-value = 0.002228
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -5.456791 -1.650209
## sample estimates:
## mean of the differences
## -3.5535
library(lmerTest)
## Loading required package: lme4
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
##
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
##
## lmer
## The following object is masked from 'package:stats':
##
## step
library(lme4)
# Run the paired-test like before
t.test(y[treat == "before"], y[treat == "after"], paired = TRUE)
##
## Paired t-test
##
## data: y[treat == "before"] and y[treat == "after"]
## t = -4.2235, df = 9, p-value = 0.002228
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -5.456791 -1.650209
## sample estimates:
## mean of the differences
## -3.5535
# Run a repeated-measures ANOVA: a random intercept per subject (x) reproduces
# the paired t-test p-value exactly (0.002228)
anova(lmer( y ~ treat + (1|x)))
## Type III Analysis of Variance Table with Satterthwaite's method
## Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
## treat 63.137 63.137 1 9 17.838 0.002228 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# lme4's sleepstudy data: reaction time over days of sleep deprivation, 18 subjects
data(sleepstudy, package="lme4")
str(sleepstudy)
## 'data.frame': 180 obs. of 3 variables:
## $ Reaction: num 250 259 251 321 357 ...
## $ Days : num 0 1 2 3 4 5 6 7 8 9 ...
## $ Subject : Factor w/ 18 levels "308","309","310",..: 1 1 1 1 1 1 1 1 1 1 ...
# Plot the data: one line per subject, one overall lm trend
ggplot(data = sleepstudy) +
geom_line(aes(x = Days, y = Reaction, group = Subject)) +
stat_smooth(aes(x = Days, y = Reaction), method = 'lm', size = 3, se = FALSE)
# Build a lm (ignores the repeated measures per subject)
lm( Reaction ~ Days, data = sleepstudy)
##
## Call:
## lm(formula = Reaction ~ Days, data = sleepstudy)
##
## Coefficients:
## (Intercept) Days
## 251.41 10.47
# Build a lmer with a random intercept per subject
# (fixed-effect estimates match the lm because the design is balanced)
(lmerOut <- lmer( Reaction ~ Days + (1|Subject), data = sleepstudy))
## Linear mixed model fit by REML ['lmerModLmerTest']
## Formula: Reaction ~ Days + (1 | Subject)
## Data: sleepstudy
## REML criterion at convergence: 1786.465
## Random effects:
## Groups Name Std.Dev.
## Subject (Intercept) 37.12
## Residual 30.99
## Number of obs: 180, groups: Subject, 18
## Fixed Effects:
## (Intercept) Days
## 251.41 10.47
# The lmer model you built during the previous exercise has been saved as lmerOut
# During this exercise, you will examine the effects of drug type using both an ANOVA framework and a regression framework
# Run an anova
anova(lmerOut)
## Type III Analysis of Variance Table with Satterthwaite's method
## Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
## Days 162703 162703 1 161 169.4 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Look at the regression coefficients
summary(lmerOut)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: Reaction ~ Days + (1 | Subject)
## Data: sleepstudy
##
## REML criterion at convergence: 1786.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.2257 -0.5529 0.0109 0.5188 4.2506
##
## Random effects:
## Groups Name Variance Std.Dev.
## Subject (Intercept) 1378.2 37.12
## Residual 960.5 30.99
## Number of obs: 180, groups: Subject, 18
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 251.4051 9.7467 22.8102 25.79 <2e-16 ***
## Days 10.4673 0.8042 161.0000 13.02 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## Days -0.371
# Read in NY hate data: yearly hate-crime incident counts by county
# (Year2 is Year rescaled so that 2010 = 0)
rawHate <- read.csv("./RInputFiles/hateNY.csv")
hate <- rawHate
str(hate)
## 'data.frame': 233 obs. of 4 variables:
## $ Year : int 2010 2011 2012 2013 2014 2015 2016 2013 2010 2011 ...
## $ County : Factor w/ 59 levels "Albany","Allegany",..: 1 1 1 1 1 1 1 2 3 3 ...
## $ TotalIncidents: int 13 7 5 3 3 3 3 1 22 11 ...
## $ Year2 : int 0 1 2 3 4 5 6 3 0 1 ...
ggplot( data = hate, aes(x = Year, y = TotalIncidents, group = County)) +
geom_line() +
geom_smooth(method = 'lm', se = FALSE)
# During this exercise, you will build a glmer
# Because most of the incidents are small count values, use a Poisson (R function family poisson) error term
# First, build a model using the actual year (variable Year, such as 2006, 2007, etc) - this model will fail
# Second, build a model using the rescaled year (variable Year2, such as 0, 1, 2, etc)
# This demonstrates the importance of considering where the intercept is located when building regression models
# Recall that a variable x can be both a fixed and random effect in a lmer() or glmer(): for example lmer(y ~ x + (x|group)) demonstrates this syntax
# glmer with raw Year (intercept extrapolated to year 0 -> convergence failure)
glmer(TotalIncidents ~ Year + (Year|County), data = hate, family = "poisson")
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl =
## control$checkConv, : Model failed to converge with max|grad| = 0.370207
## (tol = 0.001, component 1)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
## - Rescale variables?;Model is nearly unidentifiable: large eigenvalue ratio
## - Rescale variables?
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: poisson ( log )
## Formula: TotalIncidents ~ Year + (Year | County)
## Data: hate
## AIC BIC logLik deviance df.resid
## 1165.2746 1182.5298 -577.6373 1155.2746 228
## Random effects:
## Groups Name Std.Dev. Corr
## County (Intercept) 217.8915
## Year 0.1084 -1.00
## Number of obs: 233, groups: County, 59
## Fixed Effects:
## (Intercept) Year
## 295.4814 -0.1464
## convergence code 0; 3 optimizer warnings; 0 lme4 warnings
# glmer with scaled Year, Year2 (converges cleanly)
glmerOut <- glmer(TotalIncidents ~ Year2 + (Year2|County), data = hate, family = "poisson")
summary(glmerOut)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: poisson ( log )
## Formula: TotalIncidents ~ Year2 + (Year2 | County)
## Data: hate
##
## AIC BIC logLik deviance df.resid
## 1165.3 1182.5 -577.6 1155.3 228
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.5434 -0.4864 -0.1562 0.3319 3.1939
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## County (Intercept) 1.16291 1.0784
## Year2 0.01175 0.1084 0.02
## Number of obs: 233, groups: County, 59
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.27952 0.16600 7.708 1.28e-14 ***
## Year2 -0.14622 0.03324 -4.398 1.09e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## Year2 -0.338
# Pull the per-county random effects and order counties by their Year2 slope
countyTrend <- ranef(glmerOut)$County
slopeOrder <- row.names(countyTrend)[order(countyTrend$Year2)]
countyTrend$county <- factor(row.names(countyTrend), levels = slopeOrder)
# Dot plot of each county's estimated yearly trend, sorted by slope
ggplot(data = countyTrend, aes(x = county, y = Year2)) +
  geom_point() +
  coord_flip() +
  ylab("Change in hate crimes per year") +
  xlab("County")
Chapter 1 - Forecasting Demand with Time Series
Loading data in to an xts object:
ARIMA Time Series 101:
Forecasting and Evaluating:
Example code includes:
# Read in beverages data: weekly sales (and prices, *.p columns) for three
# regions (M, MET, SEC) and product tiers (hi, lo, sp)
rawBev <- read.csv("./RInputFiles/Bev.csv")
bev <- rawBev
str(bev)
## 'data.frame': 176 obs. of 14 variables:
## $ M.hi.p : num 59.2 56.3 56.3 49.3 61.3 ...
## $ M.lo.p : num 29.2 26.3 26.2 26.1 25.9 ...
## $ MET.hi.p: num 63.7 60.3 60.8 55.1 65.1 ...
## $ MET.lo.p: num 26 25.5 25.7 26.5 25.7 ...
## $ MET.sp.p: num 50.1 48.8 48.6 47.7 50.8 ...
## $ SEC.hi.p: num 58.6 54.6 57.9 49.7 63.7 ...
## $ SEC.lo.p: num 29.2 26.3 26.2 26.1 25.9 ...
## $ M.hi : int 458 477 539 687 389 399 392 417 568 583 ...
## $ M.lo : int 1455 1756 2296 3240 2252 1901 1939 1999 1798 1558 ...
## $ MET.hi : int 2037 1700 1747 2371 1741 2072 2353 2909 3204 2395 ...
## $ MET.lo : int 3437 3436 3304 3864 3406 3418 3553 3376 3233 3262 ...
## $ MET.sp : int 468 464 490 657 439 453 423 408 501 481 ...
## $ SEC.hi : int 156 151 178 217 141 149 134 148 195 170 ...
## $ SEC.lo : int 544 624 611 646 624 610 623 599 551 539 ...
# Load xts package
library(xts)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
library(forecast)
# Create the dates object as an index for your xts object
# (176 weekly observations starting 2014-01-19)
dates <- seq(as.Date("2014-01-19"), length = 176, by = "weeks")
# Create an xts object called bev_xts
bev_xts <- xts(bev, order.by = dates)
# Create the individual region sales as their own objects
MET_hi <- bev_xts[,"MET.hi"]
MET_lo <- bev_xts[,"MET.lo"]
MET_sp <- bev_xts[,"MET.sp"]
# Sum the region sales together (xts addition aligns on the shared date index)
MET_t <- MET_hi + MET_lo + MET_sp
# Plot the metropolitan region total sales
plot(MET_t)
# Hold out all of 2017 as the validation set; train on everything before it
cutoff <- as.Date("2017-01-01")
MET_t_train <- MET_t[index(MET_t) < cutoff]
MET_t_valid <- MET_t[index(MET_t) >= cutoff]
# Let auto.arima() choose the model order for metropolitan total sales
MET_t_model <- auto.arima(MET_t_train)
# Forecast the first 22 weeks of 2017
forecast_MET_t <- forecast(MET_t_model, h = 22)
# Plot this forecast
plot(forecast_MET_t)
# Work with plain numeric vectors for the error metrics
for_MET_t <- as.numeric(forecast_MET_t$mean)
v_MET_t <- as.numeric(MET_t_valid)
# Mean absolute error
MAE <- mean(abs(for_MET_t - v_MET_t))
# Mean absolute percentage error
MAPE <- 100 * mean(abs(for_MET_t - v_MET_t) / v_MET_t)
# Print to see how good your forecast is!
print(MAE)
## [1] 898.8403
print(MAPE)
## [1] 17.10332
# Convert your forecast to an xts object so it plots on the date axis
for_dates <- seq(as.Date("2017-01-01"), length = 22, by = "weeks")
for_MET_t_xts <- xts(forecast_MET_t$mean, order.by = for_dates)
# Plot the forecast first, overlay the validation data
plot(for_MET_t_xts, main = 'Forecast Comparison', ylim = c(4000, 8500))
# Overlay the validation data for 2017
lines(MET_t_valid, col = "blue")
# Now the reverse: plot the validation data set first
plot(MET_t_valid, main = 'Forecast Comparison', ylim = c(4000, 8500))
# Overlay the forecast of 2017
lines(for_MET_t_xts, col = "blue")
# Convert the 95% prediction-interval limits (column 2) to xts objects
lower <- xts(forecast_MET_t$lower[, 2], order.by = for_dates)
upper <- xts(forecast_MET_t$upper[, 2], order.by = for_dates)
# Adding confidence intervals of forecast to plot
lines(lower, col = "blue", lty = "dashed")
lines(upper, col = "blue", lty = "dashed")
Chapter 2 - Components of Demand
Price elasticity:
Seasonal/holiday/promotional effects:
Forecasting with regression:
Example code includes:
# Split the full beverage data into training (pre-2017) and validation (2017)
bev_xts_train <- bev_xts[index(bev_xts) < "2017-01-01"]
bev_xts_valid <- bev_xts[index(bev_xts) >= "2017-01-01"]
# Save the (log) prices of the MET high-end product
l_MET_hi_p <- log(as.vector(bev_xts_train[, "MET.hi.p"]))
# Save as a data frame: log-log regression gives price elasticity directly
MET_hi_train <- data.frame(as.vector(log(MET_hi[index(MET_hi) < "2017-01-01"])), l_MET_hi_p)
colnames(MET_hi_train) <- c("log_sales", "log_price")
# Calculate the regression (slope = price elasticity of demand)
model_MET_hi <- lm(log_sales ~ log_price, data = MET_hi_train)
# Plot the product's sales
plot(MET_hi)
# Plot the product's price
MET_hi_p <- bev_xts_train[, "MET.hi.p"]
plot(MET_hi_p)
# Create date indices for New Year's week in each training year
n.dates <- as.Date(c("2014-12-28", "2015-12-27", "2016-12-25"))
# Create xts objects for New Year's (indicator value 1 on those weeks)
newyear <- as.xts(rep(1, 3), order.by = n.dates)
# Create sequence of dates for merging (the 154 training weeks)
dates_train <- seq(as.Date("2014-01-19"), length = 154, by = "weeks")
# Merge training dates into New Year's object; fill = 0 makes it a 0/1 dummy
newyear <- merge(newyear, dates_train, fill = 0)
# Add newyear variable to your data frame
MET_hi_train <- data.frame(MET_hi_train, newyear=as.vector(newyear))
# Build regression for the product: price effect plus a New Year's bump
model_MET_hi_full <- lm(log_sales ~ log_price + newyear, data = MET_hi_train)
# Subset the validation prices
l_MET_hi_p_valid <- log(as.vector(bev_xts_valid[, "MET.hi.p"]))
# Create a validation data frame (price-only model, so just log_price needed)
MET_hi_valid <- data.frame(l_MET_hi_p_valid)
colnames(MET_hi_valid) <- "log_price"
# Predict the log of sales for your high end product
pred_MET_hi <- predict(model_MET_hi, MET_hi_valid)
# Convert predictions out of log scale (exp undoes the log transform)
pred_MET_hi <- exp(pred_MET_hi)
# Convert to an xts object indexed by the 22 validation weeks
dates_valid <- seq(as.Date("2017-01-01"), length = 22, by = "weeks")
pred_MET_hi_xts <- xts(pred_MET_hi, order.by = dates_valid)
# Plot the forecast
plot(pred_MET_hi_xts)
# Calculate and print the MAPE
MET_hi_v <- bev_xts_valid[,"MET.hi"]
MAPE <- 100*mean(abs((pred_MET_hi_xts - MET_hi_v)/MET_hi_v))
print(MAPE)
## [1] 29.57455
Chapter 3 - Blending Regression with Time Series
Residuals from regression model:
Forecasting residuals:
Transfer functions and ensembling:
Example code includes:
# Calculate the residuals from the full regression model (price + New Year's)
MET_hi_full_res <- resid(model_MET_hi_full)
# Convert the residuals to an xts object on the training dates
MET_hi_full_res <- xts(MET_hi_full_res, order.by = dates_train)
# Plot the histogram of the residuals
hist(MET_hi_full_res)
# Plot the residuals over time (to check for leftover autocorrelation)
plot(MET_hi_full_res)
# Build an ARIMA model on the residuals: MET_hi_arima
MET_hi_arima <- auto.arima(MET_hi_full_res)
# Look at a summary of the model
summary(MET_hi_arima)
## Series: MET_hi_full_res
## ARIMA(2,0,2) with zero mean
##
## Coefficients:
## ar1 ar2 ma1 ma2
## 1.5736 -0.7833 -0.8149 0.2865
## s.e. 0.0992 0.0758 0.1266 0.0941
##
## sigma^2 estimated as 0.03921: log likelihood=32.19
## AIC=-54.37 AICc=-53.97 BIC=-39.19
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.0004334041 0.1954382 0.145018 -47.18396 223.0022 0.5227844
## ACF1
## Training set -0.01034041
# Forecast 22 weeks with your residual model: for_MET_hi_arima
for_MET_hi_arima <- forecast(MET_hi_arima, h=22)
# Print first 10 observations
head(for_MET_hi_arima$mean, n = 10)
## Time Series:
## Start = 1079
## End = 1142
## Frequency = 0.142857142857143
## [1] -0.07662326 -0.10617141 -0.10705342 -0.08529747 -0.05037188
## [6] -0.01245420 0.01985656 0.04100076 0.04896519 0.04493645
# Convert your forecasts into an xts object
dates_valid <- seq(as.Date("2017-01-01"), length = 22, by = "weeks")
for_MET_hi_arima <- xts(for_MET_hi_arima$mean, order.by = dates_valid)
# Plot the forecast
plot(for_MET_hi_arima)
# Convert your residual forecast to the exponential version
# (residuals were on the log scale, so exp() gives multiplicative factors)
for_MET_hi_arima <- exp(for_MET_hi_arima)
# Multiply your forecasts together!
# (regression forecast x residual-adjustment factor = blended forecast)
for_MET_hi_final <- for_MET_hi_arima * pred_MET_hi_xts
# Plot the final forecast - don't touch the options!
plot(for_MET_hi_final, ylim = c(1000, 4300))
# Overlay the validation data set
lines(MET_hi_v, col = "blue")
# Calculate the MAE
MAE <- mean(abs(for_MET_hi_final - MET_hi_v))
print(MAE)
## [1] 481.6678
# Calculate the MAPE (improved from 29.6 to 28.8 vs regression alone)
MAPE <- 100 * mean(abs(for_MET_hi_final - MET_hi_v)/MET_hi_v)
print(MAPE)
## [1] 28.82836
# Build an ARIMA model directly on the sales series using auto.arima
MET_hi_model_arima <- auto.arima(MET_hi)
# Forecast the ARIMA model over the validation horizon
for_MET_hi <- forecast(MET_hi_model_arima, h = length(MET_hi_v))
# Save the forecast as an xts object
dates_valid <- seq(as.Date("2017-01-01"), length = 22, by = "weeks")
for_MET_hi_xts <- xts(for_MET_hi$mean, order.by = dates_valid)
# Calculate the MAPE of the pure time-series forecast
MAPE <- 100 * mean(abs(for_MET_hi_xts - MET_hi_v) / MET_hi_v)
print(MAPE)
## [1] 36.95411
# Ensemble the two forecasts together (simple average of ARIMA and regression)
for_MET_hi_en <- 0.5 * (for_MET_hi_xts + pred_MET_hi_xts)
# Calculate the MAE and MAPE
MAE <- mean(abs(for_MET_hi_en - MET_hi_v))
print(MAE)
## [1] 533.8911
MAPE <- 100 * mean(abs(for_MET_hi_en - MET_hi_v) / MET_hi_v)
print(MAPE)
## [1] 32.28549
Chapter 4 - Hierarchical Forecasting
Bottom-Up Hierarchical Forecasting:
Top-Down Hierarchical Forecasting:
Middle-Out Hierarchical Forecasting:
Wrap up:
Example code includes:
# Build a time series model for the MET specialty product
MET_sp_model_arima <- auto.arima(MET_sp)
# Forecast the time series model for 22 periods
for_MET_sp <- forecast(MET_sp_model_arima, h=22)
# Create an xts object
for_MET_sp_xts <- xts(for_MET_sp$mean, order.by=dates_valid)
# Validation set: all of 2017 via xts date-range subsetting
MET_sp_v <- MET_sp["2017"]
# Calculate the MAPE
MAPE <- 100 * mean(abs(for_MET_sp_xts - MET_sp_v) / MET_sp_v)
print(MAPE)
## [1] 6.703272
# Build the specialty-product training frame: log sales/price plus 0/1 dummies
# for four holiday weeks (set by matching hard-coded dates against the index)
MET_sp_train <- bev_xts_train %>%
transform(log_sales = log(MET.sp), log_price=log(MET.sp.p))
MET_sp_train <- MET_sp_train[, c("log_sales", "log_price")]
MET_sp_train$newyear <- 0
MET_sp_train$valentine <- 0
MET_sp_train$christmas <- 0
MET_sp_train$mother <- 0
MET_sp_train[index(MET_sp_train) %in% as.Date(c("2014-12-28", "2015-12-27", "2016-12-25")), "newyear"] <- 1
MET_sp_train[index(MET_sp_train) %in% as.Date(c("2014-02-09", "2015-02-08", "2016-02-07")), "valentine"] <- 1
MET_sp_train[index(MET_sp_train) %in% as.Date(c("2014-12-21", "2015-12-20", "2016-12-18")), "christmas"] <- 1
MET_sp_train[index(MET_sp_train) %in% as.Date(c("2014-05-04", "2015-05-03", "2016-05-01")), "mother"] <- 1
# NOTE(review): this reconstruction of the DataCamp exercise data is rough
# (transform() on an xts, hard-coded holiday dates) - TODO verify against the
# original course data before trusting the MAPE values below
# Build a regression model: price elasticity plus the four holiday effects
model_MET_sp <- lm(log_sales ~ log_price + newyear + valentine + christmas + mother, data = MET_sp_train)
# Build the validation frame the same way; only Valentine's (week 7) and
# Mother's Day (week 19) fall inside the 22-week 2017 validation window
MET_sp_valid <- as.data.frame(bev_xts_valid) %>%
mutate(log_sales = log(MET.sp), log_price=log(MET.sp.p)) %>%
select("log_sales", "log_price")
MET_sp_valid$newyear <- 0
MET_sp_valid$valentine <- 0
MET_sp_valid$christmas <- 0
MET_sp_valid$mother <- 0
MET_sp_valid[7, "valentine"] <- 1
MET_sp_valid[19, "mother"] <- 1
# log_sales is the response, not a predictor - drop it before predicting
MET_sp_valid$log_sales <- NULL
# Forecast the regression model using the predict function
pred_MET_sp <- predict(model_MET_sp, MET_sp_valid)
# Exponentiate your predictions (back from log scale) and create an xts object
pred_MET_sp <- exp(pred_MET_sp)
pred_MET_sp_xts <- xts(pred_MET_sp, order.by = dates_valid)
# Calculate MAPE (regression slightly beats the ARIMA's 6.70)
MAPE <- 100*mean(abs((pred_MET_sp_xts - MET_sp_v)/MET_sp_v))
print(MAPE)
## [1] 6.55473
# Ensemble the two forecasts (simple average of ARIMA and regression)
for_MET_sp_en <- 0.5 * (for_MET_sp_xts + pred_MET_sp_xts)
# Calculate the MAPE (the ensemble beats both individual models)
MAPE <- 100 * mean(abs(for_MET_sp_en - MET_sp_v) / MET_sp_v)
print(MAPE)
## [1] 6.048594
# Copy over pred_MET_lo_xts (low-end product forecast supplied by the course)
pred_MET_lo_xts <- xts(c(2960.6, 2974.1, 2943.2, 2948.6, 2915.6, 2736.4, 2953.9, 3199.4, 2934, 2898.7, 3027.7, 3165.9, 3073.1, 2842.7, 2928.7, 3070.2, 2982.2, 3018, 3031.9, 2879.4, 2993.2, 2974.1), order.by=dates_valid)
# Bottom-up hierarchical forecast: sum the three product-level forecasts
for_MET_total <- pred_MET_hi_xts + for_MET_sp_en + pred_MET_lo_xts
# Calculate a validation data set (sum of actual product sales)
MET_t_v <- bev_xts_valid[,"MET.hi"] + bev_xts_valid[,"MET.lo"] + bev_xts_valid[,"MET.sp"]
# Calculate the MAPE
MAPE <- 100 * mean(abs(for_MET_total - MET_t_v) / MET_t_v)
print(MAPE)
## [1] 10.61952
# Create the MET_total data
MET_total <- xts(data.frame(MET.hi=c(5942, 5600, 5541, 6892, 5586, 5943, 6329, 6693, 6938, 6138, 6361, 6378, 5423, 5097, 4937, 5496, 6870, 6626, 6356, 5657, 6577, 7202, 7381, 7404, 7204, 6667, 6153, 6035, 5633, 5283, 5178, 4758, 5058, 5254, 5954, 6166, 6247, 6304, 7202, 6662, 6814, 6174, 5412, 5380, 5674, 6472, 6912, 7404, 8614, 8849, 7174, 6489, 7174, 6555, 6402, 7671, 5012, 4790, 5075, 5238, 5615, 6113, 7706, 7811, 7898, 7232, 6585, 5870, 7084, 5125, 5330, 5553, 6349, 6195, 6271, 5851, 5333, 5854, 5609, 5649, 6051, 6409, 5786, 5190, 5085, 4949, 5151, 5147, 5426, 5509, 6956, 7870, 8224, 6685, 6153, 5802, 5244, 5162, 5036, 5025, 8378, 8944, 7109, 7605, 7846, 7598, 8012, 9551, 6102, 5366, 4932, 4962, 5392, 6194, 7239, 7621, 7460, 7097, 6596, 5848, 8306, 5344, 5848, 6341, 7364, 7269, 7053, 6682, 6971, 7521, 7063, 6298, 6003, 5227, 5047, 4877, 4851, 4628, 4516, 4442, 4935, 5181, 5431, 5866, 5919, 5704, 5957, 6019, 5962, 6021, 5880, 5674, 7439, 7415)),
order.by=dates_train
)
# Build a regional time series model: "top-down" approach -- model the
# aggregated regional series directly instead of summing product forecasts
MET_t_model_arima <- auto.arima(MET_total)
# Calculate a 2017 forecast for 22 periods (the length of the validation window)
for_MET_t <- forecast(MET_t_model_arima, h=22)
# Make an xts object from your forecast so it aligns with the validation dates
for_MET_t_xts <- xts(for_MET_t$mean, order.by=dates_valid)
# Calculate the MAPE -- note the top-down total (17.1) is worse here than the
# bottom-up total above (10.6), per the printed outputs
MAPE <- 100 * mean(abs(for_MET_t_xts - MET_t_v) / MET_t_v)
print(MAPE)
## [1] 17.10332
# Calculate the average historical proportions
# Method 1: average of the weekly ratios, i.e. mean(product / total)
prop_hi <- mean(MET_hi/MET_total)
prop_lo <- mean(MET_lo/MET_total)
prop_sp <- mean(MET_sp/MET_total)
# Distribute out your forecast to each product by scaling the regional
# top-down forecast with each product's historical share
for_prop_hi <- prop_hi*for_MET_t_xts
for_prop_lo <- prop_lo*for_MET_t_xts
for_prop_sp <- prop_sp*for_MET_t_xts
# Calculate the MAPE's for each product against the validation series
MAPE_hi <- 100 * mean(abs(for_prop_hi - MET_hi_v) / MET_hi_v)
print(MAPE_hi)
## [1] 38.7318
MET_lo_v <- bev_xts_valid[,"MET.lo"]
MAPE_lo <- 100 * mean(abs(for_prop_lo - MET_lo_v) / MET_lo_v)
print(MAPE_lo)
## [1] 10.70649
MAPE_sp <- 100 * mean(abs(for_prop_sp - MET_sp_v) / MET_sp_v)
print(MAPE_sp)
## [1] 6.232888
# Calculate the average historical proportions
# Method 2: ratio of the averages, i.e. mean(product) / mean(total).
# This differs from Method 1 above when the product's share varies over time.
prop_hi_2 <- mean(MET_hi) / mean(MET_total)
prop_lo_2 <- mean(MET_lo) / mean(MET_total)
prop_sp_2 <- mean(MET_sp) / mean(MET_total)
# Distribute out your forecast to each product using these shares
for_prop_hi_2 <- prop_hi_2 * for_MET_t_xts
for_prop_lo_2 <- prop_lo_2 * for_MET_t_xts
for_prop_sp_2 <- prop_sp_2 * for_MET_t_xts
# Calculate the MAPE's for each product -- compare against Method 1's
# results printed above to see which disaggregation works better per product
MAPE_hi <- 100 * mean(abs(for_prop_hi_2 - MET_hi_v) / MET_hi_v)
print(MAPE_hi)
## [1] 38.33559
MAPE_lo <- 100 * mean(abs(for_prop_lo_2 - MET_lo_v) / MET_lo_v)
print(MAPE_lo)
## [1] 8.450784
MAPE_sp <- 100 * mean(abs(for_prop_sp_2 - MET_sp_v) / MET_sp_v)
print(MAPE_sp)
## [1] 6.517045
SEC_total <- xts(data.frame(SEC.hi=c(700, 775, 789, 863, 765, 759, 757, 747, 746, 709, 749, 786, 796, 726, 727, 723, 778, 755, 739, 740, 723, 695, 727, 707, 725, 684, 667, 698, 727, 722, 748, 695, 742, 739, 715, 724, 686, 671, 688, 682, 710, 700, 672, 680, 695, 780, 751, 693, 809, 881, 703, 712, 768, 796, 808, 904, 641, 662, 693, 725, 719, 736, 715, 722, 732, 745, 689, 705, 811, 739, 744, 700, 745, 735, 732, 722, 721, 732, 750, 714, 752, 677, 731, 674, 720, 675, 741, 722, 715, 719, 649, 697, 743, 733, 772, 698, 690, 734, 713, 644, 788, 833, 749, 731, 670, 675, 675, 993, 773, 751, 697, 677, 750, 723, 780, 763, 721, 701, 704, 684, 985, 791, 731, 714, 704, 694, 685, 652, 708, 754, 747, 705, 711, 699, 712, 745, 706, 665, 666, 692, 676, 696, 689, 697, 689, 717, 697, 708, 660, 707, 715, 680, 922, 888)), order.by=dates_train
)
# Build a time series model for the region (top-down on the SEC total)
SEC_t_model_arima <- auto.arima(SEC_total)
# Forecast the time series model over the 22-period validation horizon
for_SEC_t <- forecast(SEC_t_model_arima, h=22)
# Make into an xts object aligned with the validation dates
for_SEC_t_xts <- xts(for_SEC_t$mean, order.by=dates_valid)
# Actual SEC regional total over the validation window
# (SEC has two products: hi and lo -- no "sp" series in this region)
SEC_t_v <- bev_xts_valid$SEC.hi + bev_xts_valid$SEC.lo
# Calculate the MAPE of the regional forecast
MAPE <- 100 * mean(abs(for_SEC_t_xts - SEC_t_v) / SEC_t_v)
print(MAPE)
## [1] 4.742324
# Pull the SEC product series: training data for the proportions,
# validation data for scoring the disaggregated forecasts
SEC_hi <- bev_xts_train[, "SEC.hi"]
SEC_lo <- bev_xts_train[, "SEC.lo"]
SEC_hi_v <- bev_xts_valid[, "SEC.hi"]
SEC_lo_v <- bev_xts_valid[, "SEC.lo"]
# Calculate the average of historical proportions (average-of-ratios method)
prop_hi <- mean(SEC_hi / SEC_total)
prop_lo <- mean(SEC_lo / SEC_total)
# Distribute the forecast: scale the regional forecast by each product's share
for_prop_hi <- prop_hi * for_SEC_t_xts
for_prop_lo <- prop_lo * for_SEC_t_xts
# Calculate a MAPE for each product
MAPE_hi <- 100 * mean(abs(for_prop_hi - SEC_hi_v) / SEC_hi_v)
print(MAPE_hi)
## [1] 7.988508
MAPE_lo <- 100 * mean(abs(for_prop_lo - SEC_lo_v) / SEC_lo_v)
print(MAPE_lo)
## [1] 5.202529
# Copy over for_M_t_xts data
# (hard-coded M-region forecast values supplied by the DataCamp exercise environment)
for_M_t_xts <- xts(c(2207, 2021, 2010, 2052, 2075, 2074, 2065, 2058, 2056, 2055, 2053, 2052, 2050, 2049, 2048, 2047, 2046, 2045, 2044, 2043, 2043, 2042), order.by=dates_valid)
# Calculate the state sales forecast: for_state
# State total = sum of the three regional forecasts (bottom-up aggregation).
# Use <- for assignment, consistent with the rest of this document.
for_state <- for_SEC_t_xts + for_MET_t_xts + for_M_t_xts
# See the forecasts
for_state
## [,1]
## 2017-01-01 9996.689
## 2017-01-08 9525.915
## 2017-01-15 9342.760
## 2017-01-22 9269.321
## 2017-01-29 9214.912
## 2017-02-05 9162.005
## 2017-02-12 9118.199
## 2017-02-19 9087.859
## 2017-02-26 9070.209
## 2017-03-05 9058.715
## 2017-03-12 9049.677
## 2017-03-19 9043.959
## 2017-03-26 9038.794
## 2017-04-02 9035.673
## 2017-04-09 9033.250
## 2017-04-16 9031.296
## 2017-04-23 9029.656
## 2017-04-30 9028.227
## 2017-05-07 9026.939
## 2017-05-14 9025.746
## 2017-05-21 9025.617
## 2017-05-28 9024.530
Chapter 1 - Identifying the Best Recruiting Source
Introduction - Ben Teusch, HR Analytics Consultant:
Recruiting and quality of hire:
Visualizing recruiting data:
Example code includes:
# Import the recruitment data
recruitment <- readr::read_csv("./RInputFiles/recruitment_data.csv")
## Parsed with column specification:
## cols(
## attrition = col_integer(),
## performance_rating = col_integer(),
## sales_quota_pct = col_double(),
## recruiting_source = col_character()
## )
# Look at the first few rows of the dataset
head(recruitment)
## # A tibble: 6 x 4
## attrition performance_rating sales_quota_pct recruiting_source
## <int> <int> <dbl> <chr>
## 1 1 3 1.09 Applied Online
## 2 0 3 2.39 <NA>
## 3 1 2 0.498 Campus
## 4 0 2 2.51 <NA>
## 5 0 3 1.42 Applied Online
## 6 1 3 0.548 Referral
# Get an overview of the recruitment data
summary(recruitment)
## attrition performance_rating sales_quota_pct recruiting_source
## Min. :0.000 Min. :1.000 Min. :-0.7108 Length:446
## 1st Qu.:0.000 1st Qu.:2.000 1st Qu.: 0.5844 Class :character
## Median :0.000 Median :3.000 Median : 1.0701 Mode :character
## Mean :0.213 Mean :2.895 Mean : 1.0826
## 3rd Qu.:0.000 3rd Qu.:3.000 3rd Qu.: 1.5325
## Max. :1.000 Max. :5.000 Max. : 3.6667
# See which recruiting sources the company has been using
recruitment %>%
count(recruiting_source)
## # A tibble: 5 x 2
## recruiting_source n
## <chr> <int>
## 1 Applied Online 130
## 2 Campus 56
## 3 Referral 45
## 4 Search Firm 10
## 5 <NA> 205
# Find the average sales quota attainment for each recruiting source.
# Note: rows with a missing recruiting_source (205 of them, per the count
# above) form their own NA group rather than being dropped.
avg_sales <- recruitment %>%
group_by(recruiting_source) %>%
summarize(avg_sales_quota_pct=mean(sales_quota_pct))
# Display the result
avg_sales
## # A tibble: 5 x 2
## recruiting_source avg_sales_quota_pct
## <chr> <dbl>
## 1 Applied Online 1.06
## 2 Campus 0.908
## 3 Referral 1.02
## 4 Search Firm 0.887
## 5 <NA> 1.17
# Find the average attrition for the sales team, by recruiting source, sorted from lowest attrition rate to highest
# (attrition is a 0/1 flag, so its mean is the attrition rate for the group)
avg_attrition <- recruitment %>%
group_by(recruiting_source) %>%
summarize(attrition_rate=mean(attrition)) %>%
arrange(attrition_rate)
# Display the result
avg_attrition
## # A tibble: 5 x 2
## recruiting_source attrition_rate
## <chr> <dbl>
## 1 <NA> 0.132
## 2 Applied Online 0.246
## 3 Campus 0.286
## 4 Referral 0.333
## 5 Search Firm 0.5
# Plot the bar chart
avg_sales %>% ggplot(aes(x=recruiting_source, y=avg_sales_quota_pct)) + geom_col()
# Plot the bar chart
avg_attrition %>% ggplot(aes(x=recruiting_source, y=attrition_rate)) + geom_col()
Chapter 2 - What is driving low employee engagement?
Analyzing employee engagement:
Visualizing the engagement data:
Are differences meaningful?
Example code includes:
# Import the data
survey <- readr::read_csv("./RInputFiles/survey_data.csv")
## Parsed with column specification:
## cols(
## employee_id = col_integer(),
## department = col_character(),
## engagement = col_integer(),
## salary = col_double(),
## vacation_days_taken = col_integer()
## )
# Get an overview of the data
summary(survey)
## employee_id department engagement salary
## Min. : 1.0 Length:1470 Min. :1.00 Min. : 45530
## 1st Qu.: 491.2 Class :character 1st Qu.:3.00 1st Qu.: 59407
## Median :1020.5 Mode :character Median :3.00 Median : 70481
## Mean :1024.9 Mean :3.05 Mean : 74162
## 3rd Qu.:1555.8 3rd Qu.:4.00 3rd Qu.: 84763
## Max. :2068.0 Max. :5.00 Max. :164073
## vacation_days_taken
## Min. : 0.00
## 1st Qu.: 6.00
## Median :10.00
## Mean :11.27
## 3rd Qu.:16.00
## Max. :38.00
# Examine the counts of the department variable
survey %>% count(department)
## # A tibble: 3 x 2
## department n
## <chr> <int>
## 1 Engineering 961
## 2 Finance 63
## 3 Sales 446
# Output the average engagement score for each department, sorted
survey %>%
group_by(department) %>%
summarize(avg_engagement=mean(engagement)) %>%
arrange(avg_engagement)
## # A tibble: 3 x 2
## department avg_engagement
## <chr> <dbl>
## 1 Sales 2.81
## 2 Engineering 3.15
## 3 Finance 3.24
# Create the disengaged variable and assign the result to survey:
# an employee is flagged disengaged when their engagement score is 1 or 2
# (engagement is an integer 1-5 per the summary above)
survey_disengaged <- survey %>%
mutate(disengaged = ifelse(engagement <= 2, 1, 0))
survey_disengaged
## # A tibble: 1,470 x 6
## employee_id department engagement salary vacation_days_ta~ disengaged
## <int> <chr> <int> <dbl> <int> <dbl>
## 1 1 Sales 3 103264. 7 0
## 2 2 Engineering 3 80709. 12 0
## 3 4 Engineering 3 60737. 12 0
## 4 5 Engineering 3 99116. 7 0
## 5 7 Engineering 3 51022. 18 0
## 6 8 Engineering 3 98400. 9 0
## 7 10 Engineering 3 57106. 18 0
## 8 11 Engineering 1 55065. 4 1
## 9 12 Engineering 4 77158. 12 0
## 10 13 Engineering 2 48365. 14 1
## # ... with 1,460 more rows
# Summarize the three variables by department: share disengaged, mean salary,
# and mean vacation days taken (summarize() drops the last grouping level,
# so the result is ungrouped)
survey_summary <- survey_disengaged %>%
group_by(department) %>%
summarize(pct_disengaged=mean(disengaged),
avg_salary=mean(salary),
avg_vacation_taken=mean(vacation_days_taken)
)
survey_summary
## # A tibble: 3 x 4
## department pct_disengaged avg_salary avg_vacation_taken
## <chr> <dbl> <dbl> <dbl>
## 1 Engineering 0.206 73576. 12.2
## 2 Finance 0.190 76652. 11.5
## 3 Sales 0.330 75074. 9.22
# Gather data for plotting: reshape wide -> long so the three summary metrics
# become a single "measure"/"value" pair of columns.
# (gather() is correct for the tidyr 0.8.1 loaded in this document's header;
# in tidyr >= 1.0 its successor is pivot_longer().)
survey_gathered <- survey_summary %>%
gather(key = "measure", value = "value",
pct_disengaged, avg_salary, avg_vacation_taken)
# Create three bar charts -- faceting with scales="free" because the three
# measures live on very different numeric scales (rates vs. dollars vs. days)
ggplot(survey_gathered, aes(x=measure, y=value, fill=department)) +
geom_col(position="dodge") +
facet_wrap(~ measure, scales="free")
# Add the in_sales variable (Sales vs. everyone else) and re-derive the
# disengaged flag. Note engagement < 3 is equivalent to engagement <= 2
# used earlier, since engagement is an integer score.
survey_sales <- survey %>%
mutate(in_sales = ifelse(department == "Sales", "Sales", "Other"),
disengaged = ifelse(engagement < 3, 1L, 0L)
)
# Test the hypothesis using survey_sales: chi-squared test of independence
# between disengagement and being in Sales (both categorical)
chisq.test(survey_sales$disengaged, survey_sales$in_sales)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: survey_sales$disengaged and survey_sales$in_sales
## X-squared = 25.524, df = 1, p-value = 4.368e-07
t.test(disengaged ~ in_sales, data=survey_sales)
##
## Welch Two Sample t-test
##
## data: disengaged by in_sales
## t = -4.862, df = 743.16, p-value = 1.419e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.17479596 -0.07424062
## sample estimates:
## mean in group Other mean in group Sales
## 0.2050781 0.3295964
# Test the hypothesis using the survey_sales data
t.test(vacation_days_taken ~ in_sales, data = survey_sales)
##
## Welch Two Sample t-test
##
## data: vacation_days_taken by in_sales
## t = 8.1549, df = 1022.9, p-value = 1.016e-15
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 2.229473 3.642409
## sample estimates:
## mean in group Other mean in group Sales
## 12.160156 9.224215
Chapter 3 - Are new hires getting paid too much?
Paying new hires fairly:
Omitted variable bias:
Linear regression helps to test the multivariate impacts of variables:
Example code includes:
# Import the data
pay <- readr::read_csv("./RInputFiles/fair_pay_data.csv")
## Parsed with column specification:
## cols(
## employee_id = col_integer(),
## department = col_character(),
## salary = col_double(),
## new_hire = col_character(),
## job_level = col_character()
## )
# Get an overview of the data
summary(pay)
## employee_id department salary new_hire
## Min. : 1.0 Length:1470 Min. : 43820 Length:1470
## 1st Qu.: 491.2 Class :character 1st Qu.: 59378 Class :character
## Median :1020.5 Mode :character Median : 70425 Mode :character
## Mean :1024.9 Mean : 74142
## 3rd Qu.:1555.8 3rd Qu.: 84809
## Max. :2068.0 Max. :164073
## job_level
## Length:1470
## Class :character
## Mode :character
##
##
##
# Check average salary of new hires and non-new hires
pay %>%
group_by(new_hire) %>%
summarize(avg_salary=mean(salary))
## # A tibble: 2 x 2
## new_hire avg_salary
## <chr> <dbl>
## 1 No 73425.
## 2 Yes 76074.
# Perform the correct statistical test
t.test(salary ~ new_hire, data = pay)
##
## Welch Two Sample t-test
##
## data: salary by new_hire
## t = -2.3437, df = 685.16, p-value = 0.01938
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4869.4242 -429.9199
## sample estimates:
## mean in group No mean in group Yes
## 73424.60 76074.28
t.test(salary ~ new_hire, data = pay) %>%
broom::tidy()
## # A tibble: 1 x 10
## estimate estimate1 estimate2 statistic p.value parameter conf.low
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -2650. 73425. 76074. -2.34 0.0194 685. -4869.
## # ... with 3 more variables: conf.high <dbl>, method <chr>,
## # alternative <chr>
# Create a stacked bar chart: position="fill" normalizes each bar to 100%,
# showing the job-level mix of new hires vs. existing employees
pay %>%
ggplot(aes(x=new_hire, fill=job_level)) +
geom_bar(position="fill")
# Calculate the average salary for each group of interest.
# Note: summarize() drops only the last grouping level, so pay_grouped
# remains grouped by new_hire -- harmless for the plot below.
pay_grouped <- pay %>%
group_by(new_hire, job_level) %>%
summarize(avg_salary = mean(salary))
# Graph the results using facet_wrap() -- comparing within job level
# controls for the omitted variable (job level) identified above
pay_grouped %>%
ggplot(aes(x=new_hire, y=avg_salary)) +
geom_col() +
facet_wrap(~ job_level)
# Filter the data to include only hourly employees
pay_filter <- pay %>%
filter(job_level == "Hourly")
# Test the difference in pay within a single job level;
# broom::tidy() returns the t-test result as a one-row tibble
t.test(salary ~ new_hire, data=pay_filter) %>%
broom::tidy()
## # A tibble: 1 x 10
## estimate estimate1 estimate2 statistic p.value parameter conf.low
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -1107. 63966. 65073. -1.75 0.0807 500. -2349.
## # ... with 3 more variables: conf.high <dbl>, method <chr>,
## # alternative <chr>
# Run the simple regression
model_simple <- lm(salary ~ new_hire, data = pay)
# Display the summary of model_simple
model_simple %>%
summary()
##
## Call:
## lm(formula = salary ~ new_hire, data = pay)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32255 -14466 -3681 10740 87998
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 73424.6 577.2 127.200 <2e-16 ***
## new_hireYes 2649.7 1109.4 2.388 0.017 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18900 on 1468 degrees of freedom
## Multiple R-squared: 0.003871, Adjusted R-squared: 0.003193
## F-statistic: 5.705 on 1 and 1468 DF, p-value: 0.01704
# Display a tidy summary
model_simple %>%
broom::tidy()
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 73425. 577. 127. 0
## 2 new_hireYes 2650. 1109. 2.39 0.0170
# Run the multiple regression
model_multiple <- lm(salary ~ new_hire + job_level, data = pay)
# Display the summary of model_multiple
model_multiple %>%
summary()
##
## Call:
## lm(formula = salary ~ new_hire + job_level, data = pay)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21013 -7091 -425 6771 44322
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 64049.3 308.3 207.722 <2e-16 ***
## new_hireYes 782.7 524.8 1.491 0.136
## job_levelManager 54918.8 915.3 60.001 <2e-16 ***
## job_levelSalaried 26865.6 567.2 47.369 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8930 on 1466 degrees of freedom
## Multiple R-squared: 0.7779, Adjusted R-squared: 0.7775
## F-statistic: 1712 on 3 and 1466 DF, p-value: < 2.2e-16
# Display a tidy summary
model_multiple %>%
broom::tidy()
## # A tibble: 4 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 64049. 308. 208. 0.
## 2 new_hireYes 783. 525. 1.49 1.36e- 1
## 3 job_levelManager 54919. 915. 60.0 0.
## 4 job_levelSalaried 26866. 567. 47.4 7.39e-298
Chapter 4 - Are performance ratings being given consistently?
Joining HR data:
Performance ratings and fairness:
Logistic regression is especially helpful for modeling binary response variables:
Example code includes:
# Import the data
hr_data <- readr::read_csv("./RInputFiles/hr_data.csv")
## Parsed with column specification:
## cols(
## employee_id = col_integer(),
## department = col_character(),
## job_level = col_character(),
## gender = col_character()
## )
performance_data <- readr::read_csv("./RInputFiles/performance_data.csv")
## Parsed with column specification:
## cols(
## employee_id = col_integer(),
## rating = col_integer()
## )
# Examine the datasets
summary(hr_data)
## employee_id department job_level gender
## Min. : 1.0 Length:1470 Length:1470 Length:1470
## 1st Qu.: 491.2 Class :character Class :character Class :character
## Median :1020.5 Mode :character Mode :character Mode :character
## Mean :1024.9
## 3rd Qu.:1555.8
## Max. :2068.0
summary(performance_data)
## employee_id rating
## Min. : 1.0 Min. :1.00
## 1st Qu.: 491.2 1st Qu.:2.00
## Median :1020.5 Median :3.00
## Mean :1024.9 Mean :2.83
## 3rd Qu.:1555.8 3rd Qu.:4.00
## Max. :2068.0 Max. :5.00
# Join the two tables
joined_data <- left_join(hr_data, performance_data, by = "employee_id")
# Examine the result
summary(joined_data)
## employee_id department job_level gender
## Min. : 1.0 Length:1470 Length:1470 Length:1470
## 1st Qu.: 491.2 Class :character Class :character Class :character
## Median :1020.5 Mode :character Mode :character Mode :character
## Mean :1024.9
## 3rd Qu.:1555.8
## Max. :2068.0
## rating
## Min. :1.00
## 1st Qu.:2.00
## Median :3.00
## Mean :2.83
## 3rd Qu.:4.00
## Max. :5.00
# Check whether the average performance rating differs by gender
joined_data %>%
group_by(gender) %>%
summarize(avg_rating = mean(rating))
## # A tibble: 2 x 2
## gender avg_rating
## <chr> <dbl>
## 1 Female 2.75
## 2 Male 2.92
# Add the high_performer column
performance <- joined_data %>%
mutate(high_performer = ifelse(rating >= 4, 1, 0))
# Test whether one gender is more likely to be a high performer
chisq.test(performance$gender, performance$high_performer)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: performance$gender and performance$high_performer
## X-squared = 22.229, df = 1, p-value = 2.42e-06
# Do the same test, and tidy the output
chisq.test(performance$gender, performance$high_performer) %>% broom::tidy()
## # A tibble: 1 x 4
## statistic p.value parameter method
## <dbl> <dbl> <int> <chr>
## 1 22.2 2.42e-6 1 Pearson's Chi-squared test with Yates' co~
# Visualize the distribution of high_performer by gender
performance %>%
ggplot(aes(x=gender, fill=factor(high_performer))) +
geom_bar(position="fill")
# Visualize the distribution of all ratings by gender
performance %>%
ggplot(aes(x=gender, fill=factor(rating))) +
geom_bar(position="fill")
# Visualize the distribution of job_level by gender
performance %>%
ggplot(aes(x = gender, fill = job_level)) +
geom_bar(position = "fill")
# Test whether men and women have different job level distributions
chisq.test(performance$gender, performance$job_level)
##
## Pearson's Chi-squared test
##
## data: performance$gender and performance$job_level
## X-squared = 44.506, df = 2, p-value = 2.166e-10
# Visualize the distribution of high_performer by gender, faceted by job level
performance %>%
ggplot(aes(x = gender, fill = factor(high_performer))) +
geom_bar(position = "fill") +
facet_wrap(~ job_level)
# Run a simple logistic regression
logistic_simple <- glm(high_performer ~ gender, family = "binomial", data = performance)
# View the result with summary()
logistic_simple %>%
summary()
##
## Call:
## glm(formula = high_performer ~ gender, family = "binomial", data = performance)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8871 -0.8871 -0.6957 1.4986 1.7535
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.29540 0.08813 -14.699 < 2e-16 ***
## genderMale 0.56596 0.11921 4.748 2.06e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1709.0 on 1469 degrees of freedom
## Residual deviance: 1686.1 on 1468 degrees of freedom
## AIC: 1690.1
##
## Number of Fisher Scoring iterations: 4
# View a tidy version of the result
logistic_simple %>%
broom::tidy()
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -1.30 0.0881 -14.7 6.58e-49
## 2 genderMale 0.566 0.119 4.75 2.06e- 6
# Run a multiple logistic regression
logistic_multiple <- glm(high_performer ~ gender + job_level, family = "binomial", data = performance)
# View the result with summary() or tidy()
logistic_multiple %>% broom::tidy()
## # A tibble: 4 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -1.69 0.103 -16.5 2.74e-61
## 2 genderMale 0.319 0.129 2.47 1.34e- 2
## 3 job_levelManager 2.74 0.251 10.9 1.01e-27
## 4 job_levelSalaried 1.10 0.141 7.82 5.17e-15
Chapter 5 - Improving employee safety with data
Employee safety - looking at accident rates and drivers:
Focusing on the location of interest:
Explaining the increase in accidents:
Wrap up:
Example code includes:
# Import the data
hr_data <- readr::read_csv("./RInputFiles/hr_data_2.csv")
## Parsed with column specification:
## cols(
## year = col_integer(),
## employee_id = col_integer(),
## location = col_character(),
## overtime_hours = col_integer()
## )
accident_data <- readr::read_csv("./RInputFiles/accident_data.csv")
## Parsed with column specification:
## cols(
## year = col_integer(),
## employee_id = col_integer(),
## accident_type = col_character()
## )
# Create hr_joined with left_join() and mutate().
# accident_data only has rows for employees who had an accident, so after the
# left join accident_type is NA for accident-free employee-years; map that to
# a 0/1 had_accident flag. (Assumes at most one accident row per
# employee-year -- a duplicate key would fan out rows; TODO confirm.)
hr_joined <- left_join(hr_data, accident_data, by=c("year", "employee_id")) %>%
mutate(had_accident=ifelse(is.na(accident_type), 0, 1))
hr_joined
## # A tibble: 2,940 x 6
## year employee_id location overtime_hours accident_type had_accident
## <int> <int> <chr> <int> <chr> <dbl>
## 1 2016 1 Northwood 14 <NA> 0
## 2 2017 1 Northwood 8 Mild 1
## 3 2016 2 East Valley 8 <NA> 0
## 4 2017 2 East Valley 11 <NA> 0
## 5 2016 4 East Valley 4 <NA> 0
## 6 2017 4 East Valley 2 Mild 1
## 7 2016 5 West River 0 <NA> 0
## 8 2017 5 West River 17 <NA> 0
## 9 2016 7 West River 21 <NA> 0
## 10 2017 7 West River 9 <NA> 0
## # ... with 2,930 more rows
# Find accident rate for each year
hr_joined %>%
group_by(year) %>%
summarize(accident_rate = mean(had_accident))
## # A tibble: 2 x 2
## year accident_rate
## <int> <dbl>
## 1 2016 0.0850
## 2 2017 0.120
# Test difference in accident rate between years
chisq.test(hr_joined$year, hr_joined$had_accident)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: hr_joined$year and hr_joined$had_accident
## X-squared = 9.5986, df = 1, p-value = 0.001947
# Which location had the highest accident rate?
# had_accident is 0/1, so its mean is the accident rate per location.
# arrange(desc(...)) is the idiomatic dplyr way to sort descending and is
# equivalent to arrange(-accident_rate) for a numeric column.
hr_joined %>%
  group_by(location) %>%
  summarize(accident_rate = mean(had_accident)) %>%
  arrange(desc(accident_rate))
## # A tibble: 4 x 2
## location accident_rate
## <chr> <dbl>
## 1 East Valley 0.128
## 2 Southfield 0.103
## 3 West River 0.0961
## 4 Northwood 0.0831
# Compare annual accident rates by location
accident_rates <- hr_joined %>%
group_by(location, year) %>%
summarize(accident_rate = mean(had_accident))
accident_rates
## # A tibble: 8 x 3
## # Groups: location [?]
## location year accident_rate
## <chr> <int> <dbl>
## 1 East Valley 2016 0.113
## 2 East Valley 2017 0.143
## 3 Northwood 2016 0.0644
## 4 Northwood 2017 0.102
## 5 Southfield 2016 0.0764
## 6 Southfield 2017 0.130
## 7 West River 2016 0.0824
## 8 West River 2017 0.110
# Graph it
accident_rates %>%
ggplot(aes(factor(year), accident_rate)) +
geom_col() +
facet_wrap(~location)
# Filter out the other locations
southfield <- hr_joined %>%
filter(location == "Southfield")
# Find the average overtime hours worked by year
southfield %>%
group_by(year) %>%
summarize(average_overtime_hours = mean(overtime_hours))
## # A tibble: 2 x 2
## year average_overtime_hours
## <int> <dbl>
## 1 2016 8.67
## 2 2017 9.97
# Test difference in Southfield's overtime hours between years
t.test(overtime_hours ~ year, data=southfield)
##
## Welch Two Sample t-test
##
## data: overtime_hours by year
## t = -1.6043, df = 595.46, p-value = 0.1092
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.904043 0.292747
## sample estimates:
## mean in group 2016 mean in group 2017
## 8.667774 9.973422
# Import the survey data
survey_data <- readr::read_csv("./RInputFiles/survey_data_2.csv")
## Parsed with column specification:
## cols(
## year = col_integer(),
## employee_id = col_integer(),
## engagement = col_integer()
## )
# Create the safety dataset
safety <- left_join(hr_joined, survey_data, by=c("employee_id", "year")) %>%
mutate(disengaged=ifelse(engagement <= 2, 1, 0), year=factor(year))
# Visualize the difference in % disengaged by year in Southfield
safety %>%
filter(location=="Southfield") %>%
ggplot(aes(x = year, fill = factor(disengaged))) +
geom_bar(position = "fill")
# Test whether one year had significantly more disengaged employees
southSafety <- safety %>%
filter(location=="Southfield")
chisq.test(southSafety$disengaged, southSafety$year)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: southSafety$disengaged and southSafety$year
## X-squared = 7.1906, df = 1, p-value = 0.007329
# Filter out Southfield
other_locs <- safety %>%
filter(location != "Southfield")
# Test whether one year had significantly more overtime hours worked
t.test(overtime_hours ~ year, data = other_locs)
##
## Welch Two Sample t-test
##
## data: overtime_hours by year
## t = -0.48267, df = 2320.3, p-value = 0.6294
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.9961022 0.6026035
## sample estimates:
## mean in group 2016 mean in group 2017
## 9.278015 9.474765
# Test whether one year had significantly more disengaged employees
chisq.test(other_locs$year, other_locs$disengaged)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: other_locs$year and other_locs$disengaged
## X-squared = 0.0023091, df = 1, p-value = 0.9617
# Use multiple regression to test the impact of year and disengaged on accident rate in Southfield
regression <- glm(had_accident ~ year + disengaged, family = "binomial", data = southSafety)
# Examine the output
regression %>% broom::tidy()
## # A tibble: 3 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -2.92 0.250 -11.7 1.74e-31
## 2 year2017 0.440 0.285 1.55 1.22e- 1
## 3 disengaged 1.44 0.278 5.19 2.13e- 7
Chapter 1 - Cars Data
Making predictions using machine learning:
Getting started with caret:
Sampling data:
Example code includes:
cars2018 <- readr::read_csv("./RInputFiles/cars2018.csv")
## Parsed with column specification:
## cols(
## Model = col_character(),
## `Model Index` = col_integer(),
## Displacement = col_double(),
## Cylinders = col_integer(),
## Gears = col_integer(),
## Transmission = col_character(),
## MPG = col_integer(),
## Aspiration = col_character(),
## `Lockup Torque Converter` = col_character(),
## Drive = col_character(),
## `Max Ethanol` = col_integer(),
## `Recommended Fuel` = col_character(),
## `Intake Valves Per Cyl` = col_integer(),
## `Exhaust Valves Per Cyl` = col_integer(),
## `Fuel injection` = col_character()
## )
str(cars2018, give.attr = FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1144 obs. of 15 variables:
## $ Model : chr "Acura NSX" "ALFA ROMEO 4C" "Audi R8 AWD" "Audi R8 RWD" ...
## $ Model Index : int 57 410 65 71 66 72 46 488 38 278 ...
## $ Displacement : num 3.5 1.8 5.2 5.2 5.2 5.2 2 3 8 6.2 ...
## $ Cylinders : int 6 4 10 10 10 10 4 6 16 8 ...
## $ Gears : int 9 6 7 7 7 7 6 7 7 8 ...
## $ Transmission : chr "Manual" "Manual" "Manual" "Manual" ...
## $ MPG : int 21 28 17 18 17 18 26 20 11 18 ...
## $ Aspiration : chr "Turbocharged/Supercharged" "Turbocharged/Supercharged" "Naturally Aspirated" "Naturally Aspirated" ...
## $ Lockup Torque Converter: chr "Y" "Y" "Y" "Y" ...
## $ Drive : chr "All Wheel Drive" "2-Wheel Drive, Rear" "All Wheel Drive" "2-Wheel Drive, Rear" ...
## $ Max Ethanol : int 10 10 15 15 15 15 15 10 15 10 ...
## $ Recommended Fuel : chr "Premium Unleaded Required" "Premium Unleaded Required" "Premium Unleaded Recommended" "Premium Unleaded Recommended" ...
## $ Intake Valves Per Cyl : int 2 2 2 2 2 2 2 2 2 1 ...
## $ Exhaust Valves Per Cyl : int 2 2 2 2 2 2 2 2 2 1 ...
## $ Fuel injection : chr "Direct ignition" "Direct ignition" "Direct ignition" "Direct ignition" ...
summary(cars2018)
## Model Model Index Displacement Cylinders
## Length:1144 Min. : 1.0 Min. :1.000 Min. : 3.000
## Class :character 1st Qu.: 36.0 1st Qu.:2.000 1st Qu.: 4.000
## Mode :character Median :108.0 Median :3.000 Median : 6.000
## Mean :201.3 Mean :3.087 Mean : 5.564
## 3rd Qu.:323.8 3rd Qu.:3.600 3rd Qu.: 6.000
## Max. :821.0 Max. :8.000 Max. :16.000
## Gears Transmission MPG Aspiration
## Min. : 1.000 Length:1144 Min. :11.0 Length:1144
## 1st Qu.: 6.000 Class :character 1st Qu.:19.0 Class :character
## Median : 7.000 Mode :character Median :23.0 Mode :character
## Mean : 6.935 Mean :23.2
## 3rd Qu.: 8.000 3rd Qu.:26.0
## Max. :10.000 Max. :58.0
## Lockup Torque Converter Drive Max Ethanol
## Length:1144 Length:1144 Min. :10.00
## Class :character Class :character 1st Qu.:10.00
## Mode :character Mode :character Median :10.00
## Mean :15.29
## 3rd Qu.:15.00
## Max. :85.00
## Recommended Fuel Intake Valves Per Cyl Exhaust Valves Per Cyl
## Length:1144 Min. :1.000 Min. :1.000
## Class :character 1st Qu.:2.000 1st Qu.:2.000
## Mode :character Median :2.000 Median :2.000
## Mean :1.926 Mean :1.922
## 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :2.000 Max. :2.000
## Fuel injection
## Length:1144
## Class :character
## Mode :character
##
##
##
# Print the cars2018 object
cars2018
## # A tibble: 1,144 x 15
## Model `Model Index` Displacement Cylinders Gears Transmission MPG
## <chr> <int> <dbl> <int> <int> <chr> <int>
## 1 Acura NSX 57 3.50 6 9 Manual 21
## 2 ALFA ROM~ 410 1.80 4 6 Manual 28
## 3 Audi R8 ~ 65 5.20 10 7 Manual 17
## 4 Audi R8 ~ 71 5.20 10 7 Manual 18
## 5 Audi R8 ~ 66 5.20 10 7 Manual 17
## 6 Audi R8 ~ 72 5.20 10 7 Manual 18
## 7 Audi TT ~ 46 2.00 4 6 Manual 26
## 8 BMW M4 D~ 488 3.00 6 7 Manual 20
## 9 Bugatti ~ 38 8.00 16 7 Manual 11
## 10 Chevrole~ 278 6.20 8 8 Automatic 18
## # ... with 1,134 more rows, and 8 more variables: Aspiration <chr>,
## # `Lockup Torque Converter` <chr>, Drive <chr>, `Max Ethanol` <int>,
## # `Recommended Fuel` <chr>, `Intake Valves Per Cyl` <int>, `Exhaust
## # Valves Per Cyl` <int>, `Fuel injection` <chr>
# Plot the histogram
ggplot(cars2018, aes(x = MPG)) +
geom_histogram(bins = 25) +
labs(y = "Number of cars",
x = "Fuel efficiency (mpg)")
# Deselect the 2 columns to create cars_vars
cars_vars <- cars2018 %>%
select(-Model, -`Model Index`)
# Fit a linear model
fit_all <- lm(MPG ~ ., data = cars_vars)
# Print the summary of the model
summary(fit_all)
##
## Call:
## lm(formula = MPG ~ ., data = cars_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.5261 -1.6473 -0.1096 1.3572 26.5045
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 44.539519 1.176283
## Displacement -3.786147 0.264845
## Cylinders 0.520284 0.161802
## Gears 0.157674 0.069984
## TransmissionCVT 4.877637 0.404051
## TransmissionManual -1.074608 0.366075
## AspirationTurbocharged/Supercharged -2.190248 0.267559
## `Lockup Torque Converter`Y -2.624494 0.381252
## Drive2-Wheel Drive, Rear -2.676716 0.291044
## Drive4-Wheel Drive -3.397532 0.335147
## DriveAll Wheel Drive -2.941084 0.257174
## `Max Ethanol` -0.007377 0.005898
## `Recommended Fuel`Premium Unleaded Required -0.403935 0.262413
## `Recommended Fuel`Regular Unleaded Recommended -0.996343 0.272495
## `Intake Valves Per Cyl` -1.446107 1.620575
## `Exhaust Valves Per Cyl` -2.469747 1.547748
## `Fuel injection`Multipoint/sequential ignition -0.658428 0.243819
## t value Pr(>|t|)
## (Intercept) 37.865 < 2e-16 ***
## Displacement -14.296 < 2e-16 ***
## Cylinders 3.216 0.001339 **
## Gears 2.253 0.024450 *
## TransmissionCVT 12.072 < 2e-16 ***
## TransmissionManual -2.935 0.003398 **
## AspirationTurbocharged/Supercharged -8.186 7.24e-16 ***
## `Lockup Torque Converter`Y -6.884 9.65e-12 ***
## Drive2-Wheel Drive, Rear -9.197 < 2e-16 ***
## Drive4-Wheel Drive -10.137 < 2e-16 ***
## DriveAll Wheel Drive -11.436 < 2e-16 ***
## `Max Ethanol` -1.251 0.211265
## `Recommended Fuel`Premium Unleaded Required -1.539 0.124010
## `Recommended Fuel`Regular Unleaded Recommended -3.656 0.000268 ***
## `Intake Valves Per Cyl` -0.892 0.372400
## `Exhaust Valves Per Cyl` -1.596 0.110835
## `Fuel injection`Multipoint/sequential ignition -2.700 0.007028 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.916 on 1127 degrees of freedom
## Multiple R-squared: 0.7314, Adjusted R-squared: 0.7276
## F-statistic: 191.8 on 16 and 1127 DF, p-value: < 2.2e-16
# Load caret
# Note: caret masks purrr::lift() (attach message in the rendered output).
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Split the data into training and test sets
# set.seed makes the 80/20 partition reproducible; createDataPartition
# stratifies the split on Transmission.
set.seed(1234)
in_train <- createDataPartition(cars_vars$Transmission, p = 0.8, list = FALSE)
training <- cars_vars[in_train, ]
testing <- cars_vars[-in_train, ]
# Train a linear regression model
# Response is log(MPG); trainControl(method = "none") skips resampling, so
# this is a single fit on the training data.
fit_lm <- train(log(MPG) ~ ., method = "lm", data = training,
trControl = trainControl(method = "none"))
# Print the model object
fit_lm
## Linear Regression
##
## 916 samples
## 12 predictor
##
## No pre-processing
## Resampling: None
# Train a random forest model
# Same log(MPG) response and predictors as fit_lm; no resampling.
fit_rf <- train(log(MPG) ~ ., method = "rf", data = training,
trControl = trainControl(method = "none"))
# Print the model object
fit_rf
## Random Forest
##
## 916 samples
## 12 predictor
##
## No pre-processing
## Resampling: None
# Create the new columns
# Append each model's predictions to the training data.
results <- training %>%
mutate(`Linear regression` = predict(fit_lm, training),
`Random forest` = predict(fit_rf, training))
# Evaluate the performance
# NOTE(review): both models were fit on log(MPG), so these predictions are
# on the log scale while truth = MPG is on the raw scale -- the large rmse
# (~20.9) reflects that mismatch. truth = log(MPG) would be the
# like-for-like comparison; confirm the intended scale.
yardstick::metrics(results, truth = MPG, estimate = `Linear regression`)
## # A tibble: 1 x 2
## rmse rsq
## <dbl> <dbl>
## 1 20.9 0.702
yardstick::metrics(results, truth = MPG, estimate = `Random forest`)
## # A tibble: 1 x 2
## rmse rsq
## <dbl> <dbl>
## 1 20.9 0.845
# Create the new columns
# Same as the training evaluation, but on the held-out testing data.
results <- testing %>%
mutate(`Linear regression` = predict(fit_lm, testing),
`Random forest` = predict(fit_rf, testing))
# Evaluate the performance
# NOTE(review): same scale mismatch as above -- predictions are log(MPG)
# but truth = MPG.
yardstick::metrics(results, truth = MPG, estimate = `Linear regression`)
## # A tibble: 1 x 2
## rmse rsq
## <dbl> <dbl>
## 1 20.5 0.799
yardstick::metrics(results, truth = MPG, estimate = `Random forest`)
## # A tibble: 1 x 2
## rmse rsq
## <dbl> <dbl>
## 1 20.5 0.880
# Fit the models with bootstrap resampling
# trainControl(method = "boot") uses caret's default of 25 bootstrap
# resamples to estimate out-of-sample performance instead of a single fit.
cars_lm_bt <- train(log(MPG) ~ ., method = "lm", data = training,
trControl = trainControl(method = "boot"))
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
cars_rf_bt <- train(log(MPG) ~ ., method = "rf", data = training,
trControl = trainControl(method = "boot"))
# Quick look at the models
cars_lm_bt
## Linear Regression
##
## 916 samples
## 12 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 916, 916, 916, 916, 916, 916, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.1036278 0.7890514 0.07656104
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Bootstrap-resampled random forest: caret tunes mtry and reports resampled
# RMSE/Rsquared/MAE per candidate (output below).
cars_rf_bt
## Random Forest
##
## 916 samples
## 12 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 916, 916, 916, 916, 916, 916, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 0.10015480 0.8205322 0.07299305
## 9 0.08758544 0.8466598 0.06129895
## 16 0.09100659 0.8360034 0.06313542
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 9.
# Testing-set predictions from the bootstrap-resampled models.
results <- testing %>%
mutate(`Linear regression` = predict(cars_lm_bt, testing),
`Random forest` = predict(cars_rf_bt, testing))
# NOTE(review): as above, predictions are log(MPG) while truth = MPG.
yardstick::metrics(results, truth = MPG, estimate = `Linear regression`)
## # A tibble: 1 x 2
## rmse rsq
## <dbl> <dbl>
## 1 20.5 0.799
yardstick::metrics(results, truth = MPG, estimate = `Random forest`)
## # A tibble: 1 x 2
## rmse rsq
## <dbl> <dbl>
## 1 20.5 0.903
# Observed log(MPG) vs predicted for both models, one facet per model; the
# dashed gray line is the ideal y = x.
# NOTE(review): gather() is superseded -- pivot_longer() is the modern
# replacement if this code is revisited.
results %>%
gather(Method, Result, `Linear regression`:`Random forest`) %>%
ggplot(aes(log(MPG), Result, color = Method)) +
geom_point(size = 1.5, alpha = 0.5) +
facet_wrap(~Method) +
geom_abline(lty = 2, color = "gray50") +
geom_smooth(method = "lm")
Chapter 2 - Stack Overflow Developer Data
Essential copying and pasting from Stack Overflow (largest and most trusted developer community):
Dealing with imbalanced data:
Predicting remote status:
Logistic regression. Example code includes:
# Read the Stack Overflow survey extract; readr reports the column spec.
stackoverflow <- readr::read_csv("./RInputFiles/stackoverflow.csv")
## Parsed with column specification:
## cols(
## .default = col_logical(),
## Respondent = col_integer(),
## Country = col_character(),
## Salary = col_double(),
## YearsCodedJob = col_integer(),
## CompanySizeNumber = col_double(),
## Remote = col_character(),
## CareerSatisfaction = col_integer()
## )
## See spec(...) for full column specifications.
# Make the outcome a factor with "Not remote" as the first (reference) level.
stackoverflow$Remote <- factor(stackoverflow$Remote, levels=c("Not remote", "Remote"))
str(stackoverflow, give.attr = FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame': 6991 obs. of 22 variables:
## $ Respondent : int 3 15 18 19 26 55 62 71 73 77 ...
## $ Country : chr "United Kingdom" "United Kingdom" "United States" "United States" ...
## $ Salary : num 113750 100000 130000 82500 175000 ...
## $ YearsCodedJob : int 20 20 20 3 16 4 1 1 20 20 ...
## $ OpenSource : logi TRUE FALSE TRUE FALSE FALSE FALSE ...
## $ Hobby : logi TRUE TRUE TRUE TRUE TRUE FALSE ...
## $ CompanySizeNumber : num 10000 5000 1000 10000 10000 1000 5000 20 100 1000 ...
## $ Remote : Factor w/ 2 levels "Not remote","Remote": 1 2 2 1 1 1 1 1 2 2 ...
## $ CareerSatisfaction : int 8 8 9 5 7 9 5 8 8 10 ...
## $ Data scientist : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Database administrator : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Desktop applications developer : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Developer with stats/math background: logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ DevOps : logi FALSE FALSE TRUE FALSE FALSE FALSE ...
## $ Embedded developer : logi FALSE TRUE TRUE FALSE FALSE FALSE ...
## $ Graphic designer : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Graphics programming : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Machine learning specialist : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Mobile developer : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Quality assurance engineer : logi FALSE FALSE TRUE FALSE FALSE FALSE ...
## $ Systems administrator : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Web developer : logi FALSE FALSE TRUE TRUE TRUE TRUE ...
# Print stackoverflow
# Tibble preview: 6,991 respondents x 22 variables (output below).
stackoverflow
## # A tibble: 6,991 x 22
## Respondent Country Salary YearsCodedJob OpenSource Hobby
## <int> <chr> <dbl> <int> <lgl> <lgl>
## 1 3 United Kingdom 113750 20 T T
## 2 15 United Kingdom 100000 20 F T
## 3 18 United States 130000 20 T T
## 4 19 United States 82500 3 F T
## 5 26 United States 175000 16 F T
## 6 55 Germany 64516 4 F F
## 7 62 India 6636 1 F T
## 8 71 United States 65000 1 F T
## 9 73 United States 120000 20 T T
## 10 77 United States 96283 20 T T
## # ... with 6,981 more rows, and 16 more variables:
## # CompanySizeNumber <dbl>, Remote <fct>, CareerSatisfaction <int>, `Data
## # scientist` <lgl>, `Database administrator` <lgl>, `Desktop
## # applications developer` <lgl>, `Developer with stats/math
## # background` <lgl>, DevOps <lgl>, `Embedded developer` <lgl>, `Graphic
## # designer` <lgl>, `Graphics programming` <lgl>, `Machine learning
## # specialist` <lgl>, `Mobile developer` <lgl>, `Quality assurance
## # engineer` <lgl>, `Systems administrator` <lgl>, `Web developer` <lgl>
# First count for Remote
# Class-balance check: the outcome is heavily imbalanced toward "Not remote".
stackoverflow %>%
count(Remote, sort = TRUE)
## # A tibble: 2 x 2
## Remote n
## <fct> <int>
## 1 Not remote 6273
## 2 Remote 718
# then count for Country
# Respondent counts for the five countries in this extract.
stackoverflow %>%
count(Country, sort = TRUE)
## # A tibble: 5 x 2
## Country n
## <chr> <int>
## 1 United States 3486
## 2 United Kingdom 1270
## 3 Germany 950
## 4 India 666
## 5 Canada 619
# Boxplot: years of professional coding experience by remote status.
ggplot(stackoverflow, aes(x=Remote, y=YearsCodedJob)) +
geom_boxplot() +
labs(x = NULL,
y = "Years of professional coding experience")
# Build a simple logistic regression model
# Drop the Respondent id, then pipe into glm(); `data = .` refers to the
# piped (id-free) data frame.
simple_glm <- stackoverflow %>%
select(-Respondent) %>%
glm(Remote ~ .,
family = "binomial",
data = .)
# Print the summary of the model
summary(simple_glm)
##
## Call:
## glm(formula = Remote ~ ., family = "binomial", data = .)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1942 -0.4971 -0.3824 -0.2867 2.9118
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -4.156e+00 2.929e-01 -14.187
## CountryGermany -2.034e-01 2.196e-01 -0.927
## CountryIndia 9.574e-01 2.220e-01 4.312
## CountryUnited Kingdom 5.599e-02 1.974e-01 0.284
## CountryUnited States 5.990e-01 1.799e-01 3.330
## Salary 4.076e-06 1.589e-06 2.565
## YearsCodedJob 7.133e-02 7.556e-03 9.440
## OpenSourceTRUE 4.207e-01 8.555e-02 4.917
## HobbyTRUE 8.330e-03 9.827e-02 0.085
## CompanySizeNumber -6.104e-05 1.223e-05 -4.990
## CareerSatisfaction 6.748e-02 2.664e-02 2.533
## `Data scientist`TRUE -1.186e-01 1.838e-01 -0.645
## `Database administrator`TRUE 2.763e-01 1.267e-01 2.181
## `Desktop applications developer`TRUE -2.903e-01 9.842e-02 -2.950
## `Developer with stats/math background`TRUE 2.840e-02 1.359e-01 0.209
## DevOpsTRUE -1.532e-01 1.292e-01 -1.185
## `Embedded developer`TRUE -2.777e-01 1.653e-01 -1.680
## `Graphic designer`TRUE -1.904e-01 2.725e-01 -0.699
## `Graphics programming`TRUE 1.078e-01 2.312e-01 0.466
## `Machine learning specialist`TRUE -2.289e-01 2.769e-01 -0.827
## `Mobile developer`TRUE 2.170e-01 1.019e-01 2.130
## `Quality assurance engineer`TRUE -2.826e-01 2.437e-01 -1.160
## `Systems administrator`TRUE 1.462e-01 1.421e-01 1.029
## `Web developer`TRUE 1.158e-01 9.993e-02 1.159
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## CountryGermany 0.354161
## CountryIndia 1.62e-05 ***
## CountryUnited Kingdom 0.776710
## CountryUnited States 0.000868 ***
## Salary 0.010314 *
## YearsCodedJob < 2e-16 ***
## OpenSourceTRUE 8.78e-07 ***
## HobbyTRUE 0.932444
## CompanySizeNumber 6.04e-07 ***
## CareerSatisfaction 0.011323 *
## `Data scientist`TRUE 0.518709
## `Database administrator`TRUE 0.029184 *
## `Desktop applications developer`TRUE 0.003178 **
## `Developer with stats/math background`TRUE 0.834400
## DevOpsTRUE 0.235833
## `Embedded developer`TRUE 0.093039 .
## `Graphic designer`TRUE 0.484596
## `Graphics programming`TRUE 0.641060
## `Machine learning specialist`TRUE 0.408484
## `Mobile developer`TRUE 0.033194 *
## `Quality assurance engineer`TRUE 0.246098
## `Systems administrator`TRUE 0.303507
## `Web developer`TRUE 0.246655
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4627.8 on 6990 degrees of freedom
## Residual deviance: 4268.8 on 6967 degrees of freedom
## AIC: 4316.8
##
## Number of Fisher Scoring iterations: 5
# Drop the Respondent id before modeling.
stack_select <- stackoverflow %>%
select(-Respondent)
# Split the data into training and testing sets
# Stratified 80/20 split on the Remote outcome; seed for reproducibility.
set.seed(1234)
in_train <- caret::createDataPartition(stack_select$Remote, p=0.8, list = FALSE)
training <- stack_select[in_train,]
testing <- stack_select[-in_train,]
# Up-sample the minority class so both classes have equal counts in the
# training data; yname keeps the outcome column named Remote.
up_train <- caret::upSample(x = select(training, -Remote), y = training$Remote, yname = "Remote") %>%
as_tibble()
up_train %>%
count(Remote)
## # A tibble: 2 x 2
## Remote n
## <fct> <int>
## 1 Not remote 5019
## 2 Remote 5019
# Sub-sample to 5% of original
# Small random subset of the training rows (presumably to keep the random
# forest fit below fast when knitting -- confirm intent).
inUse <- sample(1:nrow(training), round(0.05*nrow(training)), replace=FALSE)
useTrain <- training[sort(inUse), ]
# Build a logistic regression model
# Bootstrap resampling with up-sampling applied inside each resample.
# Note this model is fit on the full training set, not on useTrain.
stack_glm <- caret::train(Remote ~ ., method="glm", family="binomial", data = training,
trControl = trainControl(method = "boot", sampling = "up")
)
# Print the model object
stack_glm
## Generalized Linear Model
##
## 5594 samples
## 20 predictor
## 2 classes: 'Not remote', 'Remote'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 5594, 5594, 5594, 5594, 5594, 5594, ...
## Addtional sampling using up-sampling
##
## Resampling results:
##
## Accuracy Kappa
## 0.6568743 0.1279825
# Build a random forest model
# Fit on the 5% subset (useTrain), with the same bootstrap + per-resample
# up-sampling control as stack_glm.
stack_rf <- caret::train(Remote ~ ., method="rf", data = useTrain,
trControl = trainControl(method = "boot", sampling="up")
)
# Print the model object
stack_rf
## Random Forest
##
## 280 samples
## 20 predictor
## 2 classes: 'Not remote', 'Remote'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 280, 280, 280, 280, 280, 280, ...
## Addtional sampling using up-sampling
##
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8626254 0.110738058
## 12 0.9038825 -0.002127159
## 23 0.8887612 0.035777206
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 12.
# Confusion matrix for logistic regression model
# Held-out testing set; the 'Positive' class is "Not remote" (first level).
caret::confusionMatrix(predict(stack_glm, testing), testing$Remote)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Not remote Remote
## Not remote 837 53
## Remote 417 90
##
## Accuracy : 0.6636
## 95% CI : (0.6381, 0.6883)
## No Information Rate : 0.8976
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1395
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6675
## Specificity : 0.6294
## Pos Pred Value : 0.9404
## Neg Pred Value : 0.1775
## Prevalence : 0.8976
## Detection Rate : 0.5991
## Detection Prevalence : 0.6371
## Balanced Accuracy : 0.6484
##
## 'Positive' Class : Not remote
##
# Confusion matrix for random forest model
# Higher accuracy than the glm but very low specificity for "Remote"
# (0.1259 in the output): accuracy alone misleads with imbalanced classes.
caret::confusionMatrix(predict(stack_rf, testing), testing$Remote)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Not remote Remote
## Not remote 1207 125
## Remote 47 18
##
## Accuracy : 0.8769
## 95% CI : (0.8585, 0.8937)
## No Information Rate : 0.8976
## P-Value [Acc > NIR] : 0.9945
##
## Kappa : 0.1166
## Mcnemar's Test P-Value : 4.327e-09
##
## Sensitivity : 0.9625
## Specificity : 0.1259
## Pos Pred Value : 0.9062
## Neg Pred Value : 0.2769
## Prevalence : 0.8976
## Detection Rate : 0.8640
## Detection Prevalence : 0.9535
## Balanced Accuracy : 0.5442
##
## 'Positive' Class : Not remote
##
# Predict values
# Append each model's class predictions to the testing data.
testing_results <- testing %>%
mutate(`Logistic regression` = predict(stack_glm, testing), `Random forest` = predict(stack_rf, testing))
## Calculate accuracy
yardstick::accuracy(testing_results, truth = Remote, estimate = `Logistic regression`)
## [1] 0.6635648
yardstick::accuracy(testing_results, truth = Remote, estimate = `Random forest`)
## [1] 0.876879
## Calculate positive predict value
# PPV here matches the "Pos Pred Value" rows of the confusion matrices
# above, i.e. the first factor level ("Not remote") is the event.
yardstick::ppv(testing_results, truth = Remote, estimate = `Logistic regression`)
## [1] 0.9404494
yardstick::ppv(testing_results, truth = Remote, estimate = `Random forest`)
## [1] 0.9061562
Chapter 3 - Voting
Predicting voter turnout from survey data:
Vote 2016:
Cross-validation is the process of sub-dividing the data into folds, with each fold used once as the validation set:
Comparing model performance:
Example code includes:
# Read the voter survey; every column parses as an integer except the
# character outcome column.
voters <- readr::read_csv("./RInputFiles/voters.csv")
## Parsed with column specification:
## cols(
## .default = col_integer(),
## turnout16_2016 = col_character()
## )
## See spec(...) for full column specifications.
# Outcome as a factor with "Did not vote" as the first (reference) level.
voters$turnout16_2016 <- factor(voters$turnout16_2016, levels=c("Did not vote", "Voted"))
str(voters, give.attr = FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame': 6692 obs. of 43 variables:
## $ case_identifier : int 779 2108 2597 4148 4460 5225 5903 6059 8048 13112 ...
## $ turnout16_2016 : Factor w/ 2 levels "Did not vote",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ RIGGED_SYSTEM_1_2016: int 3 2 2 1 3 3 3 2 4 2 ...
## $ RIGGED_SYSTEM_2_2016: int 4 1 4 4 1 3 4 3 4 3 ...
## $ RIGGED_SYSTEM_3_2016: int 1 3 1 1 3 2 1 3 1 1 ...
## $ RIGGED_SYSTEM_4_2016: int 4 1 4 4 1 2 1 2 3 2 ...
## $ RIGGED_SYSTEM_5_2016: int 3 3 1 2 3 2 2 1 3 2 ...
## $ RIGGED_SYSTEM_6_2016: int 2 2 1 1 2 3 1 2 1 2 ...
## $ track_2016 : int 2 2 1 1 2 2 1 2 2 2 ...
## $ persfinretro_2016 : int 2 3 3 1 2 2 2 3 2 1 ...
## $ econtrend_2016 : int 1 3 3 1 2 2 1 3 1 1 ...
## $ Americatrend_2016 : int 1 1 1 3 3 1 2 3 2 1 ...
## $ futuretrend_2016 : int 4 1 1 3 4 3 1 3 1 1 ...
## $ wealth_2016 : int 2 1 2 2 1 2 2 1 2 2 ...
## $ values_culture_2016 : int 2 3 3 3 3 2 3 3 1 3 ...
## $ US_respect_2016 : int 2 3 1 1 2 2 2 3 3 3 ...
## $ trustgovt_2016 : int 3 3 3 3 3 2 3 3 3 3 ...
## $ trust_people_2016 : int 8 2 1 1 1 2 2 1 2 1 ...
## $ helpful_people_2016 : int 1 1 2 1 1 1 2 2 1 2 ...
## $ fair_people_2016 : int 8 2 1 1 1 2 2 1 2 1 ...
## $ imiss_a_2016 : int 2 1 1 1 1 2 1 1 3 1 ...
## $ imiss_b_2016 : int 2 1 1 2 1 1 1 2 1 1 ...
## $ imiss_c_2016 : int 1 2 2 3 1 2 2 1 4 2 ...
## $ imiss_d_2016 : int 1 2 1 1 1 1 1 2 1 1 ...
## $ imiss_e_2016 : int 1 1 3 1 1 3 1 2 1 1 ...
## $ imiss_f_2016 : int 2 1 1 2 1 2 1 3 2 1 ...
## $ imiss_g_2016 : int 1 4 3 3 3 1 3 4 2 2 ...
## $ imiss_h_2016 : int 1 2 2 2 1 1 1 2 1 1 ...
## $ imiss_i_2016 : int 2 2 4 4 2 1 1 3 2 1 ...
## $ imiss_j_2016 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ imiss_k_2016 : int 1 2 1 1 2 1 1 4 2 1 ...
## $ imiss_l_2016 : int 1 4 1 2 4 1 1 3 1 1 ...
## $ imiss_m_2016 : int 1 2 1 2 1 1 1 1 1 1 ...
## $ imiss_n_2016 : int 1 2 1 1 1 1 1 2 2 1 ...
## $ imiss_o_2016 : int 2 1 1 1 1 2 1 2 2 1 ...
## $ imiss_p_2016 : int 2 1 2 3 1 3 1 1 4 1 ...
## $ imiss_q_2016 : int 1 1 1 2 2 1 1 4 2 1 ...
## $ imiss_r_2016 : int 2 1 1 2 1 2 1 2 4 2 ...
## $ imiss_s_2016 : int 1 2 1 2 2 1 1 1 1 1 ...
## $ imiss_t_2016 : int 1 1 3 3 1 1 3 4 1 1 ...
## $ imiss_u_2016 : int 2 2 2 2 1 3 3 1 4 2 ...
## $ imiss_x_2016 : int 1 3 1 2 1 1 1 4 1 1 ...
## $ imiss_y_2016 : int 1 4 2 3 1 1 1 3 2 1 ...
# Print voters
# Tibble preview: 6,692 rows x 43 columns (output below).
voters
## # A tibble: 6,692 x 43
## case_identifier turnout16_2016 RIGGED_SYSTEM_1_2016 RIGGED_SYSTEM_2_20~
## <int> <fct> <int> <int>
## 1 779 Voted 3 4
## 2 2108 Voted 2 1
## 3 2597 Voted 2 4
## 4 4148 Voted 1 4
## 5 4460 Voted 3 1
## 6 5225 Voted 3 3
## 7 5903 Voted 3 4
## 8 6059 Voted 2 3
## 9 8048 Voted 4 4
## 10 13112 Voted 2 3
## # ... with 6,682 more rows, and 39 more variables:
## # RIGGED_SYSTEM_3_2016 <int>, RIGGED_SYSTEM_4_2016 <int>,
## # RIGGED_SYSTEM_5_2016 <int>, RIGGED_SYSTEM_6_2016 <int>,
## # track_2016 <int>, persfinretro_2016 <int>, econtrend_2016 <int>,
## # Americatrend_2016 <int>, futuretrend_2016 <int>, wealth_2016 <int>,
## # values_culture_2016 <int>, US_respect_2016 <int>,
## # trustgovt_2016 <int>, trust_people_2016 <int>,
## # helpful_people_2016 <int>, fair_people_2016 <int>, imiss_a_2016 <int>,
## # imiss_b_2016 <int>, imiss_c_2016 <int>, imiss_d_2016 <int>,
## # imiss_e_2016 <int>, imiss_f_2016 <int>, imiss_g_2016 <int>,
## # imiss_h_2016 <int>, imiss_i_2016 <int>, imiss_j_2016 <int>,
## # imiss_k_2016 <int>, imiss_l_2016 <int>, imiss_m_2016 <int>,
## # imiss_n_2016 <int>, imiss_o_2016 <int>, imiss_p_2016 <int>,
## # imiss_q_2016 <int>, imiss_r_2016 <int>, imiss_s_2016 <int>,
## # imiss_t_2016 <int>, imiss_u_2016 <int>, imiss_x_2016 <int>,
## # imiss_y_2016 <int>
# How many people voted?
# Strong class imbalance: far more "Voted" than "Did not vote" (counts below).
voters %>%
count(turnout16_2016)
## # A tibble: 2 x 2
## turnout16_2016 n
## <fct> <int>
## 1 Did not vote 264
## 2 Voted 6428
# How do the responses on the survey vary with voting behavior?
# Share of respondents giving particular answers, by turnout group.
voters %>%
group_by(turnout16_2016) %>%
summarize(`Elections don't matter` = mean(RIGGED_SYSTEM_1_2016 <= 2),
`Economy is getting better` = mean(econtrend_2016 == 1),
`Crime is very important` = mean(imiss_a_2016 == 2))
## # A tibble: 2 x 4
## turnout16_2016 `Elections don't ~ `Economy is gettin~ `Crime is very im~
## <fct> <dbl> <dbl> <dbl>
## 1 Did not vote 0.553 0.163 0.292
## 2 Voted 0.341 0.289 0.342
## Visualize difference by voter turnout
# Overlaid density-scaled histograms of econtrend_2016 by turnout group.
voters %>%
ggplot(aes(econtrend_2016, ..density.., fill = turnout16_2016)) +
geom_histogram(alpha = 0.5, position = "identity", binwidth = 1) +
labs(title = "Overall, is the economy getting better or worse?")
# Remove the case_identifier column (an id, not a predictor)
voters_select <- voters %>%
select(-case_identifier)
# Build a simple logistic regression model
simple_glm <- glm(turnout16_2016 ~ ., family = "binomial",
data = voters_select)
# Print the summary
summary(simple_glm)
##
## Call:
## glm(formula = turnout16_2016 ~ ., family = "binomial", data = voters_select)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2373 0.1651 0.2214 0.3004 1.7708
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.457036 0.732721 3.353 0.000799 ***
## RIGGED_SYSTEM_1_2016 0.236284 0.085081 2.777 0.005484 **
## RIGGED_SYSTEM_2_2016 0.064749 0.089208 0.726 0.467946
## RIGGED_SYSTEM_3_2016 0.049357 0.107352 0.460 0.645680
## RIGGED_SYSTEM_4_2016 -0.074694 0.087583 -0.853 0.393749
## RIGGED_SYSTEM_5_2016 0.190252 0.096454 1.972 0.048556 *
## RIGGED_SYSTEM_6_2016 -0.005881 0.101381 -0.058 0.953740
## track_2016 0.241075 0.121467 1.985 0.047178 *
## persfinretro_2016 -0.040229 0.106714 -0.377 0.706191
## econtrend_2016 -0.295370 0.087224 -3.386 0.000708 ***
## Americatrend_2016 -0.105213 0.080845 -1.301 0.193116
## futuretrend_2016 0.210568 0.071201 2.957 0.003103 **
## wealth_2016 -0.069405 0.026344 -2.635 0.008424 **
## values_culture_2016 -0.041402 0.038670 -1.071 0.284332
## US_respect_2016 -0.068200 0.043785 -1.558 0.119322
## trustgovt_2016 0.315354 0.166655 1.892 0.058456 .
## trust_people_2016 0.040423 0.041518 0.974 0.330236
## helpful_people_2016 -0.037513 0.035353 -1.061 0.288643
## fair_people_2016 -0.017081 0.030170 -0.566 0.571294
## imiss_a_2016 0.397121 0.138987 2.857 0.004273 **
## imiss_b_2016 -0.250803 0.155454 -1.613 0.106667
## imiss_c_2016 0.017536 0.090647 0.193 0.846606
## imiss_d_2016 0.043510 0.122118 0.356 0.721619
## imiss_e_2016 -0.095552 0.078603 -1.216 0.224126
## imiss_f_2016 -0.323280 0.105432 -3.066 0.002168 **
## imiss_g_2016 -0.332034 0.078673 -4.220 2.44e-05 ***
## imiss_h_2016 -0.157298 0.107111 -1.469 0.141954
## imiss_i_2016 0.088695 0.091467 0.970 0.332196
## imiss_j_2016 0.060271 0.138429 0.435 0.663280
## imiss_k_2016 -0.181030 0.082726 -2.188 0.028646 *
## imiss_l_2016 0.274689 0.106781 2.572 0.010098 *
## imiss_m_2016 -0.124269 0.147888 -0.840 0.400746
## imiss_n_2016 -0.441612 0.090040 -4.905 9.36e-07 ***
## imiss_o_2016 0.198635 0.143160 1.388 0.165286
## imiss_p_2016 0.102987 0.105669 0.975 0.329751
## imiss_q_2016 0.244567 0.119093 2.054 0.040017 *
## imiss_r_2016 0.198839 0.121969 1.630 0.103050
## imiss_s_2016 -0.067310 0.134465 -0.501 0.616666
## imiss_t_2016 -0.116757 0.068143 -1.713 0.086639 .
## imiss_u_2016 0.022902 0.097312 0.235 0.813939
## imiss_x_2016 -0.017789 0.097349 -0.183 0.855003
## imiss_y_2016 0.150205 0.094536 1.589 0.112092
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2224.3 on 6691 degrees of freedom
## Residual deviance: 2004.4 on 6650 degrees of freedom
## AIC: 2088.4
##
## Number of Fisher Scoring iterations: 6
# Split data into training and testing sets
# Stratified 80/20 split on turnout; seed for reproducibility.
set.seed(1234)
in_train <- caret::createDataPartition(voters_select$turnout16_2016, p = 0.8, list = FALSE)
training <- voters_select[in_train, ]
testing <- voters_select[-in_train, ]
# Perform logistic regression with upsampling and no resampling
vote_glm_1 <- caret::train(turnout16_2016 ~ ., method = "glm", family = "binomial", data = training,
trControl = trainControl(method = "none", sampling = "up")
)
# Print vote_glm
vote_glm_1
## Generalized Linear Model
##
## 5355 samples
## 41 predictor
## 2 classes: 'Did not vote', 'Voted'
##
## No pre-processing
## Resampling: None
## Addtional sampling using up-sampling
# Random 10% subset of the training rows (presumably to keep the repeated-CV
# fits below fast when knitting -- confirm intent).
useSmall <- sort(sample(1:nrow(training), round(0.1*nrow(training)), replace=FALSE))
trainSmall <- training[useSmall, ]
# Logistic regression
# 10-fold cross-validation repeated twice, with up-sampling inside each
# fold. The glm.fit warnings below are the classic sign of (near-)separation
# in some folds of this small subset -- worth confirming.
vote_glm <- caret::train(turnout16_2016 ~ ., method = "glm", family = "binomial", data = trainSmall,
trControl = trainControl(method = "repeatedcv", repeats = 2, sampling = "up")
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Print vote_glm
# Repeated-CV resampling results (accuracy/kappa) for the glm.
vote_glm
## Generalized Linear Model
##
## 536 samples
## 41 predictor
## 2 classes: 'Did not vote', 'Voted'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 2 times)
## Summary of sample sizes: 482, 482, 482, 483, 482, 483, ...
## Addtional sampling using up-sampling
##
## Resampling results:
##
## Accuracy Kappa
## 0.8713138 0.04298445
# Random forest
# Same repeated-CV + up-sampling setup, on the same small training subset.
vote_rf <- caret::train(turnout16_2016 ~ ., method = "rf", data = trainSmall,
trControl = trainControl(method="repeatedcv", repeats=2, sampling = "up")
)
# Print vote_rf
vote_rf
## Random Forest
##
## 536 samples
## 41 predictor
## 2 classes: 'Did not vote', 'Voted'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 2 times)
## Summary of sample sizes: 483, 482, 483, 482, 483, 483, ...
## Addtional sampling using up-sampling
##
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9674179 -0.001265823
## 21 0.9627184 -0.006073829
## 41 0.9542628 0.019107234
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# Confusion matrix for logistic regression model on training data
# Evaluating on the data the model was trained on -- optimistic by design;
# compare with the testing-set matrices further below.
caret::confusionMatrix(predict(vote_glm, trainSmall), trainSmall$turnout16_2016)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Did not vote Voted
## Did not vote 17 48
## Voted 0 471
##
## Accuracy : 0.9104
## 95% CI : (0.883, 0.9332)
## No Information Rate : 0.9683
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3836
## Mcnemar's Test P-Value : 1.17e-11
##
## Sensitivity : 1.00000
## Specificity : 0.90751
## Pos Pred Value : 0.26154
## Neg Pred Value : 1.00000
## Prevalence : 0.03172
## Detection Rate : 0.03172
## Detection Prevalence : 0.12127
## Balanced Accuracy : 0.95376
##
## 'Positive' Class : Did not vote
##
# Confusion matrix for random forest model on training data
# Perfect training accuracy (kappa = 1 in the output) -- a red flag for
# overfitting; the testing-set matrix below tells the real story.
caret::confusionMatrix(predict(vote_rf, trainSmall), trainSmall$turnout16_2016)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Did not vote Voted
## Did not vote 17 0
## Voted 0 519
##
## Accuracy : 1
## 95% CI : (0.9931, 1)
## No Information Rate : 0.9683
## P-Value [Acc > NIR] : 3.143e-08
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.00000
## Specificity : 1.00000
## Pos Pred Value : 1.00000
## Neg Pred Value : 1.00000
## Prevalence : 0.03172
## Detection Rate : 0.03172
## Detection Prevalence : 0.03172
## Balanced Accuracy : 1.00000
##
## 'Positive' Class : Did not vote
##
# Confusion matrix for logistic regression model on testing data
# Held-out evaluation; 'Positive' class is "Did not vote" (first level).
caret::confusionMatrix(predict(vote_glm, testing), testing$turnout16_2016)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Did not vote Voted
## Did not vote 14 166
## Voted 38 1119
##
## Accuracy : 0.8474
## 95% CI : (0.827, 0.8663)
## No Information Rate : 0.9611
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0642
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.26923
## Specificity : 0.87082
## Pos Pred Value : 0.07778
## Neg Pred Value : 0.96716
## Prevalence : 0.03889
## Detection Rate : 0.01047
## Detection Prevalence : 0.13463
## Balanced Accuracy : 0.57002
##
## 'Positive' Class : Did not vote
##
# Confusion matrix for random forest model on testing data
# Sensitivity for "Did not vote" collapses to ~0.019 here, versus a perfect
# fit on the training subset -- the random forest has overfit.
caret::confusionMatrix(predict(vote_rf, testing), testing$turnout16_2016)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Did not vote Voted
## Did not vote 1 1
## Voted 51 1284
##
## Accuracy : 0.9611
## 95% CI : (0.9493, 0.9708)
## No Information Rate : 0.9611
## P-Value [Acc > NIR] : 0.5368
##
## Kappa : 0.0343
## Mcnemar's Test P-Value : 1.083e-11
##
## Sensitivity : 0.0192308
## Specificity : 0.9992218
## Pos Pred Value : 0.5000000
## Neg Pred Value : 0.9617978
## Prevalence : 0.0388930
## Detection Rate : 0.0007479
## Detection Prevalence : 0.0014959
## Balanced Accuracy : 0.5092263
##
## 'Positive' Class : Did not vote
##
Chapter 4 - Nuns
Catholic sisters survey from 1967 - https://curate.nd.edu/show/0r967368551 with codebook at https://curate.nd.edu/downloads/0v838051f6x
Exploratory data analysis with tidy data:
Predicting age with supervised learning:
Wrap up:
Example code includes:
# Read the 1967 sisters survey; every column parses as an integer.
sisters67 <- readr::read_csv("./RInputFiles/sisters.csv")
## Parsed with column specification:
## cols(
## .default = col_integer()
## )
## See spec(...) for full column specifications.
str(sisters67, give.attr = FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame': 19278 obs. of 67 variables:
## $ age : int 40 30 40 30 40 30 70 30 60 80 ...
## $ sister: int 11545 16953 73323 75339 36303 95318 22474 114526 20707 91062 ...
## $ v116 : int 5 4 2 4 4 2 4 4 4 5 ...
## $ v117 : int 2 1 2 3 2 4 5 1 5 1 ...
## $ v118 : int 2 4 5 3 3 5 5 4 5 2 ...
## $ v119 : int 2 4 5 4 5 5 5 5 4 1 ...
## $ v120 : int 4 1 3 3 1 1 5 1 5 2 ...
## $ v121 : int 4 1 4 4 4 5 4 1 5 5 ...
## $ v122 : int 4 1 2 2 4 1 1 2 2 1 ...
## $ v123 : int 5 5 3 4 4 3 1 5 2 5 ...
## $ v124 : int 1 1 5 2 3 1 5 3 5 4 ...
## $ v125 : int 4 2 5 3 4 2 5 2 5 5 ...
## $ v126 : int 2 1 1 3 1 1 5 1 5 2 ...
## $ v127 : int 1 4 5 2 2 1 1 1 4 1 ...
## $ v128 : int 2 1 4 3 4 4 5 2 5 3 ...
## $ v129 : int 4 4 5 4 5 4 5 5 4 1 ...
## $ v130 : int 2 4 4 3 3 1 5 1 5 4 ...
## $ v131 : int 1 2 2 3 5 5 2 3 3 2 ...
## $ v132 : int 5 5 5 4 5 2 2 5 4 5 ...
## $ v133 : int 2 4 5 3 5 1 4 2 4 4 ...
## $ v134 : int 2 4 4 3 4 4 1 4 4 2 ...
## $ v135 : int 5 5 4 3 5 4 1 5 5 2 ...
## $ v136 : int 1 4 4 2 4 4 1 4 4 2 ...
## $ v137 : int 1 1 1 1 1 1 2 1 2 4 ...
## $ v138 : int 2 1 3 1 3 1 4 1 2 1 ...
## $ v139 : int 3 1 3 3 1 1 4 1 5 4 ...
## $ v140 : int 1 2 1 2 4 4 5 2 5 2 ...
## $ v141 : int 5 5 4 3 3 3 4 5 4 4 ...
## $ v142 : int 1 1 2 2 2 1 2 1 4 3 ...
## $ v143 : int 2 1 5 4 4 5 4 5 4 1 ...
## $ v144 : int 1 2 1 2 1 1 3 1 4 2 ...
## $ v145 : int 4 4 5 3 4 1 5 2 5 4 ...
## $ v146 : int 4 4 5 4 5 5 4 5 2 4 ...
## $ v147 : int 2 2 1 2 3 1 2 1 2 2 ...
## $ v148 : int 1 1 4 1 1 4 4 1 5 1 ...
## $ v149 : int 4 2 4 2 1 1 2 1 5 4 ...
## $ v150 : int 2 1 2 3 1 4 2 1 5 2 ...
## $ v151 : int 4 1 5 4 4 1 5 1 4 3 ...
## $ v152 : int 2 1 1 3 1 1 2 1 4 4 ...
## $ v153 : int 5 5 5 5 5 5 5 5 5 2 ...
## $ v154 : int 1 1 4 2 1 3 5 1 4 2 ...
## $ v155 : int 5 4 4 3 5 5 4 5 4 4 ...
## $ v156 : int 1 1 2 2 1 1 5 1 5 2 ...
## $ v157 : int 4 1 4 3 1 1 2 1 3 4 ...
## $ v158 : int 4 4 5 2 5 5 2 5 5 4 ...
## $ v159 : int 1 4 4 1 2 1 4 1 4 2 ...
## $ v160 : int 2 5 5 4 4 4 5 5 5 4 ...
## $ v161 : int 2 4 3 3 1 1 4 1 2 4 ...
## $ v162 : int 5 4 5 4 4 4 5 5 5 4 ...
## $ v163 : int 2 1 2 3 1 1 2 1 4 1 ...
## $ v164 : int 4 1 5 2 4 1 5 1 5 4 ...
## $ v165 : int 2 1 3 2 1 1 1 1 2 2 ...
## $ v166 : int 2 4 5 2 1 1 5 2 5 4 ...
## $ v167 : int 2 4 5 3 4 4 2 4 5 2 ...
## $ v168 : int 5 5 5 4 5 5 5 5 4 5 ...
## $ v169 : int 1 1 1 2 1 1 5 1 4 4 ...
## $ v170 : int 5 1 4 3 2 4 4 1 2 4 ...
## $ v171 : int 5 5 5 4 1 2 5 5 5 5 ...
## $ v172 : int 2 1 5 5 2 2 5 1 5 3 ...
## $ v173 : int 2 2 4 2 2 1 4 1 1 4 ...
## $ v174 : int 2 4 2 3 4 1 5 5 4 2 ...
## $ v175 : int 1 1 4 2 2 1 2 1 5 4 ...
## $ v176 : int 4 4 4 3 1 4 4 3 3 2 ...
## $ v177 : int 4 4 5 3 4 2 4 4 4 4 ...
## $ v178 : int 4 1 4 2 1 1 2 1 4 4 ...
## $ v179 : int 4 4 4 3 4 2 4 4 5 4 ...
## $ v180 : int 4 2 5 3 3 1 1 1 1 2 ...
# View sisters67 column-wise: glimpse() prints every variable with sample values
glimpse(sisters67)
## Observations: 19,278
## Variables: 67
## $ age <int> 40, 30, 40, 30, 40, 30, 70, 30, 60, 80, 90, 40, 60, 80,...
## $ sister <int> 11545, 16953, 73323, 75339, 36303, 95318, 22474, 114526...
## $ v116 <int> 5, 4, 2, 4, 4, 2, 4, 4, 4, 5, 2, 5, 4, 4, 3, 4, 5, 3, 4...
## $ v117 <int> 2, 1, 2, 3, 2, 4, 5, 1, 5, 1, 3, 2, 5, 4, 1, 1, 1, 1, 2...
## $ v118 <int> 2, 4, 5, 3, 3, 5, 5, 4, 5, 2, 4, 4, 4, 5, 2, 4, 4, 4, 2...
## $ v119 <int> 2, 4, 5, 4, 5, 5, 5, 5, 4, 1, 4, 5, 3, 4, 5, 5, 5, 5, 4...
## $ v120 <int> 4, 1, 3, 3, 1, 1, 5, 1, 5, 2, 3, 1, 5, 4, 4, 1, 1, 1, 2...
## $ v121 <int> 4, 1, 4, 4, 4, 5, 4, 1, 5, 5, 4, 1, 3, 4, 3, 2, 5, 3, 3...
## $ v122 <int> 4, 1, 2, 2, 4, 1, 1, 2, 2, 1, 4, 5, 1, 2, 4, 2, 1, 4, 2...
## $ v123 <int> 5, 5, 3, 4, 4, 3, 1, 5, 2, 5, 3, 4, 3, 4, 5, 5, 4, 5, 4...
## $ v124 <int> 1, 1, 5, 2, 3, 1, 5, 3, 5, 4, 4, 1, 3, 2, 1, 1, 3, 2, 2...
## $ v125 <int> 4, 2, 5, 3, 4, 2, 5, 2, 5, 5, 5, 5, 5, 5, 1, 1, 5, 1, 2...
## $ v126 <int> 2, 1, 1, 3, 1, 1, 5, 1, 5, 2, 4, 1, 5, 1, 3, 1, 5, 1, 2...
## $ v127 <int> 1, 4, 5, 2, 2, 1, 1, 1, 4, 1, 4, 1, 3, 5, 2, 1, 1, 2, 2...
## $ v128 <int> 2, 1, 4, 3, 4, 4, 5, 2, 5, 3, 2, 5, 5, 4, 1, 1, 4, 1, 1...
## $ v129 <int> 4, 4, 5, 4, 5, 4, 5, 5, 4, 1, 5, 1, 5, 5, 5, 1, 5, 5, 5...
## $ v130 <int> 2, 4, 4, 3, 3, 1, 5, 1, 5, 4, 5, 5, 1, 4, 1, 1, 4, 3, 2...
## $ v131 <int> 1, 2, 2, 3, 5, 5, 2, 3, 3, 2, 3, 4, 3, 4, 2, 4, 3, 4, 4...
## $ v132 <int> 5, 5, 5, 4, 5, 2, 2, 5, 4, 5, 4, 5, 5, 5, 4, 5, 3, 5, 5...
## $ v133 <int> 2, 4, 5, 3, 5, 1, 4, 2, 4, 4, 5, 1, 1, 1, 2, 4, 3, 1, 2...
## $ v134 <int> 2, 4, 4, 3, 4, 4, 1, 4, 4, 2, 3, 5, 2, 4, 4, 4, 3, 3, 4...
## $ v135 <int> 5, 5, 4, 3, 5, 4, 1, 5, 5, 2, 4, 5, 3, 5, 2, 5, 3, 5, 5...
## $ v136 <int> 1, 4, 4, 2, 4, 4, 1, 4, 4, 2, 4, 4, 4, 4, 2, 2, 4, 2, 2...
## $ v137 <int> 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 5, 1, 3, 1, 1, 1, 1, 1, 1...
## $ v138 <int> 2, 1, 3, 1, 3, 1, 4, 1, 2, 1, 3, 2, 1, 3, 2, 1, 4, 3, 1...
## $ v139 <int> 3, 1, 3, 3, 1, 1, 4, 1, 5, 4, 4, 1, 2, 4, 1, 1, 2, 1, 1...
## $ v140 <int> 1, 2, 1, 2, 4, 4, 5, 2, 5, 2, 2, 1, 5, 2, 1, 4, 1, 2, 2...
## $ v141 <int> 5, 5, 4, 3, 3, 3, 4, 5, 4, 4, 5, 5, 5, 5, 5, 4, 4, 3, 5...
## $ v142 <int> 1, 1, 2, 2, 2, 1, 2, 1, 4, 3, 4, 2, 2, 3, 2, 2, 1, 3, 1...
## $ v143 <int> 2, 1, 5, 4, 4, 5, 4, 5, 4, 1, 4, 5, 5, 2, 5, 5, 3, 3, 5...
## $ v144 <int> 1, 2, 1, 2, 1, 1, 3, 1, 4, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1...
## $ v145 <int> 4, 4, 5, 3, 4, 1, 5, 2, 5, 4, 4, 1, 4, 5, 2, 2, 1, 2, 2...
## $ v146 <int> 4, 4, 5, 4, 5, 5, 4, 5, 2, 4, 4, 4, 4, 4, 2, 5, 3, 5, 4...
## $ v147 <int> 2, 2, 1, 2, 3, 1, 2, 1, 2, 2, 3, 1, 2, 1, 2, 2, 3, 2, 4...
## $ v148 <int> 1, 1, 4, 1, 1, 4, 4, 1, 5, 1, 4, 1, 3, 1, 1, 1, 2, 1, 1...
## $ v149 <int> 4, 2, 4, 2, 1, 1, 2, 1, 5, 4, 4, 2, 5, 1, 1, 2, 5, 2, 1...
## $ v150 <int> 2, 1, 2, 3, 1, 4, 2, 1, 5, 2, 5, 2, 2, 2, 3, 1, 5, 1, 1...
## $ v151 <int> 4, 1, 5, 4, 4, 1, 5, 1, 4, 3, 4, 1, 2, 5, 2, 4, 5, 1, 4...
## $ v152 <int> 2, 1, 1, 3, 1, 1, 2, 1, 4, 4, 4, 1, 4, 3, 4, 1, 1, 1, 2...
## $ v153 <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 3, 2, 5, 5, 4, 5, 5, 5, 4...
## $ v154 <int> 1, 1, 4, 2, 1, 3, 5, 1, 4, 2, 5, 1, 5, 5, 1, 1, 4, 1, 1...
## $ v155 <int> 5, 4, 4, 3, 5, 5, 4, 5, 4, 4, 3, 4, 3, 5, 2, 5, 5, 5, 1...
## $ v156 <int> 1, 1, 2, 2, 1, 1, 5, 1, 5, 2, 5, 1, 1, 4, 1, 1, 3, 1, 1...
## $ v157 <int> 4, 1, 4, 3, 1, 1, 2, 1, 3, 4, 2, 1, 2, 3, 3, 2, 3, 1, 1...
## $ v158 <int> 4, 4, 5, 2, 5, 5, 2, 5, 5, 4, 4, 5, 4, 2, 5, 4, 4, 3, 4...
## $ v159 <int> 1, 4, 4, 1, 2, 1, 4, 1, 4, 2, 4, 1, 3, 2, 1, 1, 2, 1, 1...
## $ v160 <int> 2, 5, 5, 4, 4, 4, 5, 5, 5, 4, 5, 2, 5, 5, 5, 4, 5, 2, 4...
## $ v161 <int> 2, 4, 3, 3, 1, 1, 4, 1, 2, 4, 5, 1, 4, 5, 1, 1, 3, 1, 1...
## $ v162 <int> 5, 4, 5, 4, 4, 4, 5, 5, 5, 4, 4, 5, 5, 5, 3, 4, 5, 5, 5...
## $ v163 <int> 2, 1, 2, 3, 1, 1, 2, 1, 4, 1, 4, 1, 1, 1, 1, 2, 3, 3, 1...
## $ v164 <int> 4, 1, 5, 2, 4, 1, 5, 1, 5, 4, 4, 1, 1, 5, 1, 4, 3, 1, 4...
## $ v165 <int> 2, 1, 3, 2, 1, 1, 1, 1, 2, 2, 5, 2, 1, 5, 2, 3, 3, 2, 4...
## $ v166 <int> 2, 4, 5, 2, 1, 1, 5, 2, 5, 4, 5, 1, 2, 4, 2, 4, 5, 3, 4...
## $ v167 <int> 2, 4, 5, 3, 4, 4, 2, 4, 5, 2, 4, 4, 2, 5, 2, 4, 3, 2, 4...
## $ v168 <int> 5, 5, 5, 4, 5, 5, 5, 5, 4, 5, 5, 4, 5, 5, 3, 4, 3, 4, 5...
## $ v169 <int> 1, 1, 1, 2, 1, 1, 5, 1, 4, 4, 5, 1, 1, 1, 1, 1, 1, 1, 1...
## $ v170 <int> 5, 1, 4, 3, 2, 4, 4, 1, 2, 4, 3, 3, 3, 5, 4, 3, 5, 3, 4...
## $ v171 <int> 5, 5, 5, 4, 1, 2, 5, 5, 5, 5, 5, 1, 5, 5, 3, 4, 5, 4, 5...
## $ v172 <int> 2, 1, 5, 5, 2, 2, 5, 1, 5, 3, 5, 1, 5, 5, 2, 2, 3, 5, 2...
## $ v173 <int> 2, 2, 4, 2, 2, 1, 4, 1, 1, 4, 4, 1, 2, 5, 4, 4, 3, 1, 4...
## $ v174 <int> 2, 4, 2, 3, 4, 1, 5, 5, 4, 2, 4, 5, 3, 4, 2, 4, 3, 3, 4...
## $ v175 <int> 1, 1, 4, 2, 2, 1, 2, 1, 5, 4, 3, 1, 2, 4, 1, 4, 3, 1, 1...
## $ v176 <int> 4, 4, 4, 3, 1, 4, 4, 3, 3, 2, 5, 5, 3, 5, 3, 1, 3, 3, 2...
## $ v177 <int> 4, 4, 5, 3, 4, 2, 4, 4, 4, 4, 5, 2, 5, 5, 3, 2, 5, 4, 4...
## $ v178 <int> 4, 1, 4, 2, 1, 1, 2, 1, 4, 4, 4, 1, 2, 4, 1, 2, 3, 1, 2...
## $ v179 <int> 4, 4, 4, 3, 4, 2, 4, 4, 5, 4, 5, 2, 5, 5, 3, 1, 5, 3, 4...
## $ v180 <int> 4, 2, 5, 3, 3, 1, 1, 1, 1, 2, 4, 2, 2, 5, 1, 1, 3, 3, 2...
# Histogram of respondent ages, bucketed into 10-year bins
ggplot(data = sisters67, mapping = aes(x = age)) +
  geom_histogram(binwidth = 10)
# Tidy the data set: drop the respondent id and reshape wide -> long,
# giving one row per (age, question, answer).
# NOTE(review): gather() is superseded; tidyr::pivot_longer() is the modern
# replacement (it orders rows differently, so the printed output would change).
tidy_sisters <- sisters67 %>%
select(-sister) %>%
gather(key, value, -age)
# Print the structure of tidy_sisters
glimpse(tidy_sisters)
## Observations: 1,253,070
## Variables: 3
## $ age <int> 40, 30, 40, 30, 40, 30, 70, 30, 60, 80, 90, 40, 60, 80, ...
## $ key <chr> "v116", "v116", "v116", "v116", "v116", "v116", "v116", ...
## $ value <int> 5, 4, 2, 4, 4, 2, 4, 4, 4, 5, 2, 5, 4, 4, 3, 4, 5, 3, 4,...
# Overall agreement with all questions varied by age:
# mean Likert answer per age bracket; na.rm guards against missing answers
tidy_sisters %>%
group_by(age) %>%
summarize(value = mean(value, na.rm = TRUE))
## # A tibble: 9 x 2
## age value
## <int> <dbl>
## 1 20 2.82
## 2 30 2.81
## 3 40 2.82
## 4 50 2.95
## 5 60 3.10
## 6 70 3.25
## 7 80 3.39
## 8 90 3.55
## 9 100 3.93
# Number of respondents agreed or disagreed overall:
# count(value) tallies rows per answer level across all questions
tidy_sisters %>%
count(value)
## # A tibble: 5 x 2
## value n
## <int> <int>
## 1 1 326386
## 2 2 211534
## 3 3 160961
## 4 4 277062
## 5 5 277127
# Visualize how mean age varies with answer value for questions v153 through v170
tidy_sisters %>%
  filter(key %in% paste0("v", 153:170)) %>%
  group_by(key, value) %>%
  summarize(age = mean(age, na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = value, y = age, colour = key)) +
  geom_line(show.legend = FALSE) +
  facet_wrap(facets = ~key, nrow = 3)
# Drop the respondent identifier, keeping age plus the question columns
sisters_select <- select(sisters67, -sister)
# Regress age on every remaining column
simple_lm <- lm(age ~ ., data = sisters_select)
# Show the coefficient table and fit statistics
summary(simple_lm)
##
## Call:
## lm(formula = age ~ ., data = sisters_select)
##
## Residuals:
## Min 1Q Median 3Q Max
## -46.663 -9.586 -1.207 8.991 53.286
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.59542 1.07173 25.748 < 2e-16 ***
## v116 -0.69014 0.07727 -8.931 < 2e-16 ***
## v117 -0.15914 0.08869 -1.794 0.072786 .
## v118 -0.74668 0.08473 -8.813 < 2e-16 ***
## v119 -0.35314 0.08321 -4.244 2.21e-05 ***
## v120 -0.13875 0.07513 -1.847 0.064813 .
## v121 0.04265 0.07794 0.547 0.584247
## v122 0.05237 0.08086 0.648 0.517208
## v123 -0.96372 0.09061 -10.636 < 2e-16 ***
## v124 0.44543 0.08681 5.131 2.91e-07 ***
## v125 0.50420 0.07425 6.791 1.15e-11 ***
## v126 0.44358 0.08579 5.170 2.36e-07 ***
## v127 -0.04781 0.07915 -0.604 0.545810
## v128 0.04459 0.07595 0.587 0.557162
## v129 0.03044 0.07881 0.386 0.699351
## v130 0.51028 0.08064 6.328 2.54e-10 ***
## v131 -0.54431 0.08417 -6.467 1.02e-10 ***
## v132 -0.02527 0.09337 -0.271 0.786703
## v133 -0.67041 0.07563 -8.864 < 2e-16 ***
## v134 -0.12144 0.09060 -1.340 0.180130
## v135 0.45773 0.10886 4.205 2.63e-05 ***
## v136 -0.08790 0.07438 -1.182 0.237293
## v137 0.74412 0.10230 7.274 3.63e-13 ***
## v138 0.31534 0.10601 2.974 0.002939 **
## v139 1.36585 0.10514 12.990 < 2e-16 ***
## v140 -0.73675 0.07371 -9.995 < 2e-16 ***
## v141 0.50515 0.09355 5.400 6.75e-08 ***
## v142 -0.22168 0.08357 -2.653 0.007992 **
## v143 0.08320 0.08375 0.993 0.320536
## v144 1.09413 0.10870 10.066 < 2e-16 ***
## v145 -0.46821 0.08217 -5.698 1.23e-08 ***
## v146 -0.50063 0.08094 -6.185 6.32e-10 ***
## v147 -0.28499 0.09800 -2.908 0.003640 **
## v148 1.47288 0.09165 16.070 < 2e-16 ***
## v149 -0.29683 0.08562 -3.467 0.000528 ***
## v150 -0.33882 0.08396 -4.036 5.46e-05 ***
## v151 0.79497 0.08901 8.931 < 2e-16 ***
## v152 -0.02073 0.08179 -0.253 0.799906
## v153 -0.53982 0.09110 -5.925 3.17e-09 ***
## v154 0.98930 0.07843 12.614 < 2e-16 ***
## v155 0.96066 0.09897 9.706 < 2e-16 ***
## v156 1.07836 0.09176 11.752 < 2e-16 ***
## v157 0.07577 0.08249 0.918 0.358378
## v158 0.05330 0.08419 0.633 0.526696
## v159 -0.28846 0.08321 -3.467 0.000528 ***
## v160 0.28066 0.08559 3.279 0.001043 **
## v161 0.67235 0.08759 7.677 1.71e-14 ***
## v162 -0.29388 0.10063 -2.920 0.003501 **
## v163 -1.38883 0.09242 -15.027 < 2e-16 ***
## v164 -0.44411 0.07017 -6.329 2.52e-10 ***
## v165 -0.49356 0.09033 -5.464 4.71e-08 ***
## v166 0.24787 0.08329 2.976 0.002924 **
## v167 -0.06290 0.08185 -0.768 0.442236
## v168 0.33712 0.09425 3.577 0.000349 ***
## v169 1.44938 0.08634 16.786 < 2e-16 ***
## v170 1.01626 0.09083 11.189 < 2e-16 ***
## v171 0.90086 0.08359 10.777 < 2e-16 ***
## v172 0.07702 0.07176 1.073 0.283135
## v173 0.76461 0.06936 11.025 < 2e-16 ***
## v174 0.22074 0.07851 2.812 0.004934 **
## v175 0.18369 0.07930 2.316 0.020553 *
## v176 1.03334 0.08996 11.487 < 2e-16 ***
## v177 -0.07908 0.09643 -0.820 0.412153
## v178 -0.08005 0.08250 -0.970 0.331906
## v179 0.29778 0.09251 3.219 0.001289 **
## v180 0.11524 0.08566 1.345 0.178538
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.13 on 19212 degrees of freedom
## Multiple R-squared: 0.3332, Adjusted R-squared: 0.3309
## F-statistic: 147.7 on 65 and 19212 DF, p-value: < 2.2e-16
# Split the data into training and validation/test sets.
# createDataPartition stratifies on age so each split has a similar age profile.
set.seed(1234)
in_train <- caret::createDataPartition(sisters_select$age, p = 0.6, list = FALSE)
training <- sisters_select[in_train, ]
validation_test <- sisters_select[-in_train, ]
# Split the validation and test sets
# (50/50 split of the held-out 40%: roughly 20% test, 20% validation)
set.seed(1234)
in_test <- caret::createDataPartition(validation_test$age, p = 0.5, list = FALSE)
testing <- validation_test[in_test, ]
validation <- validation_test[-in_test, ]
# Fit a CART model; caret tunes the rpart complexity parameter (cp)
# by bootstrap resampling (25 reps by default, per the output below)
sisters_cart <- caret::train(age ~ ., method = "rpart", data = training)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
# Print the CART model
sisters_cart
## CART
##
## 11569 samples
## 65 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 11569, 11569, 11569, 11569, 11569, 11569, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.02304336 14.61359 0.1724244 12.00686
## 0.04935303 14.89119 0.1403800 12.41303
## 0.11481230 15.54485 0.1046127 13.19914
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.02304336.
# Subsample 500 training rows so the slower boosted models fit quickly.
# sample(n, ...) draws from 1:n directly, avoiding the 1:nrow(x) anti-pattern;
# it produces draws identical to sample(1:nrow(training), ...) for the same RNG state.
# NOTE(review): no set.seed() immediately before this sample, so the subsample
# depends on earlier RNG consumption -- add a seed here for reproducibility.
inSmall <- sample(nrow(training), 500, replace = FALSE)
smallSisters <- training[sort(inSmall), ]
# Fit gradient-boosted tree models via caret (xgboost and gbm backends)
sisters_xgb <- caret::train(age ~ ., method = "xgbTree", data = smallSisters)
sisters_gbm <- caret::train(age ~ ., method = "gbm", data = smallSisters, verbose = FALSE)
# Make predictions on the three models:
# add one prediction column per fitted model to the validation set
modeling_results <- validation %>%
mutate(CART = predict(sisters_cart, validation),
XGB = predict(sisters_xgb, validation),
GBM = predict(sisters_gbm, validation))
# View the predictions
modeling_results %>%
select(CART, XGB, GBM)
## # A tibble: 3,854 x 3
## CART XGB GBM
## <dbl> <dbl> <dbl>
## 1 49.5 46.2 44.3
## 2 49.5 61.1 56.5
## 3 58.0 59.9 65.6
## 4 58.0 60.0 61.9
## 5 58.0 71.6 74.6
## 6 49.5 50.9 53.4
## 7 49.5 58.6 55.0
## 8 49.5 42.2 38.0
## 9 41.3 41.7 38.2
## 10 58.0 51.6 50.0
## # ... with 3,844 more rows
# Compare performance (RMSE and R-squared on the validation set) for each model
yardstick::metrics(modeling_results, truth = age, estimate = CART)
## # A tibble: 1 x 2
## rmse rsq
## <dbl> <dbl>
## 1 14.6 0.163
yardstick::metrics(modeling_results, truth = age, estimate = XGB)
## # A tibble: 1 x 2
## rmse rsq
## <dbl> <dbl>
## 1 13.5 0.287
yardstick::metrics(modeling_results, truth = age, estimate = GBM)
## # A tibble: 1 x 2
## rmse rsq
## <dbl> <dbl>
## 1 13.6 0.286
# Calculate RMSE: final evaluation of the chosen model (GBM) on the held-out test set
testing %>%
mutate(prediction = predict(sisters_gbm, testing)) %>%
yardstick::rmse(truth = age, estimate = prediction)
## [1] 13.87981
Chapter 1 - Introduction to Process Analysis
Introduction and overview:
Activities as cornerstones of processes:
Components of process data:
Example code includes:
# Load the processmapR package using library
library(processmapR)
##
## Attaching package: 'processmapR'
## The following object is masked from 'package:stats':
##
## frequency
library(bupaR)
## Loading required package: edeaR
## Loading required package: eventdataR
## Loading required package: xesreadR
## Loading required package: processmonitR
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'processmonitR'
## Loading required package: petrinetR
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'petrinetR'
##
## Attaching package: 'bupaR'
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:utils':
##
## timestamp
handling <- c('Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Blood test', 'Blood test', 'Blood test', 'Blood test', 'Blood test', 'Blood test', 'Blood test', 'Blood test', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'X-Ray', 'X-Ray', 'X-Ray', 'X-Ray', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Blood test', 'Blood test', 'Blood test', 'Blood test', 'Blood test', 'Blood test', 'Blood test', 'Blood test', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'X-Ray', 'X-Ray', 'X-Ray', 'X-Ray', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss 
Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out')
patient <- c('43', '155', '156', '170', '172', '184', '221', '278', '348', '420', '455', '493', '43', '155', '156', '170', '172', '184', '221', '278', '348', '420', '455', '493', '43', '156', '170', '172', '184', '278', '348', '420', '43', '156', '170', '172', '184', '278', '348', '420', '155', '221', '455', '493', '43', '155', '156', '170', '172', '184', '221', '278', '348', '420', '455', '493', '43', '155', '156', '170', '172', '184', '221', '278', '348', '420', '455', '493', '43', '155', '156', '170', '172', '184', '221', '278', '348', '420', '455', '493', '43', '155', '156', '170', '172', '184', '221', '278', '348', '420', '455', '493', '43', '156', '170', '172', '184', '278', '348', '420', '43', '156', '170', '172', '184', '278', '348', '420', '155', '221', '455', '493', '43', '155', '156', '170', '172', '184', '221', '278', '348', '420', '455', '493', '43', '155', '156', '170', '172', '184', '221', '278', '348', '420', '455', '493')
employee <- c('r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r3', 'r3', 'r3', 'r3', 'r3', 'r3', 'r3', 'r3', 'r4', 'r4', 'r4', 'r4', 'r4', 'r4', 'r4', 'r4', 'r5', 'r5', 'r5', 'r5', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r3', 'r3', 'r3', 'r3', 'r3', 'r3', 'r3', 'r3', 'r4', 'r4', 'r4', 'r4', 'r4', 'r4', 'r4', 'r4', 'r5', 'r5', 'r5', 'r5', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7')
handling_id <- c('43', '155', '156', '170', '172', '184', '221', '278', '348', '420', '455', '493', '543', '655', '656', '670', '672', '684', '721', '778', '848', '920', '955', '993', '1020', '1072', '1081', '1082', '1088', '1127', '1163', '1199', '1257', '1309', '1318', '1319', '1325', '1364', '1400', '1436', '1557', '1587', '1710', '1730', '1777', '1889', '1890', '1904', '1906', '1918', '1955', '2012', '2082', '2154', '2189', '2227', '2272', '2384', '2385', '2399', '2401', '2413', '2450', '2507', '2577', '2649', '2684', '2720', '43', '155', '156', '170', '172', '184', '221', '278', '348', '420', '455', '493', '543', '655', '656', '670', '672', '684', '721', '778', '848', '920', '955', '993', '1020', '1072', '1081', '1082', '1088', '1127', '1163', '1199', '1257', '1309', '1318', '1319', '1325', '1364', '1400', '1436', '1557', '1587', '1710', '1730', '1777', '1889', '1890', '1904', '1906', '1918', '1955', '2012', '2082', '2154', '2189', '2227', '2272', '2384', '2385', '2399', '2401', '2413', '2450', '2507', '2577', '2649', '2684', '2720')
registration_type <- c('start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete')
rTime <- c('2017-02-19 04:38:51', '2017-06-03 10:05:28', '2017-06-03 10:05:28', '2017-06-17 15:10:30', '2017-06-17 23:00:33', '2017-06-27 07:48:22', '2017-08-03 17:05:27', '2017-09-26 20:22:49', '2017-11-24 08:28:44', '2018-02-08 03:39:21', '2018-03-14 21:04:28', '2018-04-29 04:55:10', '2017-02-19 07:28:53', '2017-06-04 06:27:00', '2017-06-03 13:23:14', '2017-06-17 16:31:58', '2017-06-18 18:29:13', '2017-06-28 00:14:50', '2017-08-04 07:22:06', '2017-09-27 22:57:03', '2017-11-24 10:33:00', '2018-02-08 17:33:12', '2018-03-15 15:12:41', '2018-04-30 19:40:22', '2017-02-20 19:59:18', '2017-06-04 15:18:50', '2017-06-18 22:51:07', '2017-06-21 02:43:27', '2017-07-01 23:55:10', '2017-09-28 22:58:23', '2017-11-25 12:06:18', '2018-02-12 09:01:38', '2017-02-21 06:49:49', '2017-06-04 23:23:28', '2017-06-19 06:44:30', '2017-06-21 11:16:30', '2017-07-02 11:16:08', '2017-09-29 07:28:10', '2017-11-25 21:54:56', '2018-02-12 19:43:42', '2017-06-05 00:12:24', '2017-08-05 08:25:17', '2018-03-17 10:30:24', '2018-05-02 07:32:45', '2017-02-21 14:50:43', '2017-06-05 14:03:19', '2017-06-05 10:26:16', '2017-06-19 22:46:10', '2017-06-22 04:39:35', '2017-07-03 01:28:49', '2017-08-05 22:06:23', '2017-09-29 19:13:51', '2017-11-26 06:52:23', '2018-02-17 02:44:58', '2018-03-18 00:20:51', '2018-05-02 18:14:11', '2017-02-24 14:58:43', '2017-06-05 15:58:53', '2017-06-05 15:58:53', '2017-06-20 03:48:37', '2017-06-22 08:40:55', '2017-07-03 03:39:51', '2017-08-08 23:17:45', '2017-09-29 21:16:01', '2017-11-27 04:56:53', '2018-02-20 09:49:29', '2018-03-18 08:12:07', '2018-05-03 00:11:10', '2017-02-19 07:28:53', '2017-06-03 14:19:00', '2017-06-03 13:23:14', '2017-06-17 16:31:58', '2017-06-18 01:07:42', '2017-06-27 12:22:51', '2017-08-03 19:25:12', '2017-09-26 22:17:18', '2017-11-24 10:33:00', '2018-02-08 06:01:38', '2018-03-15 00:34:01', '2018-04-29 07:39:14', '2017-02-19 21:58:08', '2017-06-04 14:23:26', '2017-06-04 06:27:00', '2017-06-18 04:14:55', '2017-06-19 00:40:19', '2017-06-28 12:48:20', 
'2017-08-04 21:09:17', '2017-09-28 12:00:12', '2017-11-25 00:44:30', '2018-02-09 07:05:52', '2018-03-16 04:08:03', '2018-05-01 10:37:51', '2017-02-21 03:12:26', '2017-06-04 19:35:51', '2017-06-19 03:01:11', '2017-06-21 08:02:20', '2017-07-02 07:43:48', '2017-09-29 04:58:49', '2017-11-25 18:30:43', '2018-02-12 13:57:13', '2017-02-21 09:57:05', '2017-06-05 02:46:59', '2017-06-19 11:40:53', '2017-06-21 16:09:26', '2017-07-02 16:03:16', '2017-09-29 12:44:39', '2017-11-26 02:40:30', '2018-02-12 23:53:46', '2017-06-05 04:39:38', '2017-08-05 13:56:39', '2018-03-17 14:09:40', '2018-05-02 12:24:41', '2017-02-21 17:57:58', '2017-06-05 15:58:53', '2017-06-05 14:03:19', '2017-06-20 01:44:29', '2017-06-22 08:40:55', '2017-07-03 03:39:51', '2017-08-05 23:53:27', '2017-09-29 21:16:01', '2017-11-26 09:44:37', '2018-02-17 06:17:57', '2018-03-18 03:22:17', '2018-05-02 21:17:12', '2017-02-24 16:03:49', '2017-06-05 17:22:16', '2017-06-05 17:15:30', '2017-06-20 05:36:40', '2017-06-22 10:59:58', '2017-07-03 05:00:48', '2017-08-09 00:13:39', '2017-09-29 23:42:48', '2017-11-27 06:53:23', '2018-02-20 12:04:00', '2018-03-18 10:48:34', '2018-05-03 02:11:42')
rOrder <- c(43, 155, 156, 170, 172, 184, 221, 278, 348, 420, 455, 493, 543, 655, 656, 670, 672, 684, 721, 778, 848, 920, 955, 993, 1020, 1072, 1081, 1082, 1088, 1127, 1163, 1199, 1257, 1309, 1318, 1319, 1325, 1364, 1400, 1436, 1557, 1587, 1710, 1730, 1777, 1889, 1890, 1904, 1906, 1918, 1955, 2012, 2082, 2154, 2189, 2227, 2272, 2384, 2385, 2399, 2401, 2413, 2450, 2507, 2577, 2649, 2684, 2720, 2764, 2876, 2877, 2891, 2893, 2905, 2942, 2999, 3069, 3141, 3176, 3214, 3264, 3376, 3377, 3391, 3393, 3405, 3442, 3499, 3569, 3641, 3676, 3714, 3741, 3793, 3802, 3803, 3809, 3848, 3884, 3920, 3978, 4030, 4039, 4040, 4046, 4085, 4121, 4157, 4278, 4308, 4431, 4451, 4498, 4610, 4611, 4625, 4627, 4639, 4676, 4733, 4803, 4875, 4910, 4948, 4993, 5105, 5106, 5120, 5122, 5134, 5171, 5228, 5298, 5370, 5405, 5441)
# Assemble the raw vectors into a single data frame.
# Factor levels are set explicitly so summaries print in a stable order.
pFrame <- tibble(handling=factor(handling, levels=c('Blood test', 'Check-out', 'Discuss Results', 'MRI SCAN', 'Registration', 'Triage and Assessment', 'X-Ray')),
patient=patient,
employee=factor(employee, levels=c('r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7')),
handling_id=handling_id,
registration_type=factor(registration_type, levels=c("complete", "start")),
time=as.POSIXct(rTime),
.order=rOrder
)
# Wrap the data frame as a bupaR event log, mapping each column to its role:
# case = patient, activity = handling, lifecycle = start/complete, resource = employee
patients <- eventlog(pFrame,
case_id = "patient",
activity_id = "handling",
activity_instance_id = "handling_id",
lifecycle_id = "registration_type",
timestamp = "time",
resource_id = "employee")
# The function slice can be used to take a slice of cases out of the eventdata. slice(1:10) will select the first ten cases in the event log, where first is defined by the current ordering of the data.
# How many patients (cases) are there?
n_cases(patients)
## [1] 12
# Print the summary of the data (event/case/trace counts plus per-column stats)
summary(patients)
## Number of events: 136
## Number of cases: 12
## Number of traces: 2
## Number of distinct activities: 7
## Average trace length: 11.33333
##
## Start eventlog: 2017-02-19 04:38:51
## End eventlog: 2018-05-03 02:11:42
## handling patient employee handling_id
## Blood test :16 Length:136 r1:24 Length:136
## Check-out :24 Class :character r2:24 Class :character
## Discuss Results :24 Mode :character r3:16 Mode :character
## MRI SCAN :16 r4:16
## Registration :24 r5: 8
## Triage and Assessment:24 r6:24
## X-Ray : 8 r7:24
## registration_type time .order
## complete:68 Min. :2017-02-19 04:38:51 Min. : 1.00
## start :68 1st Qu.:2017-06-14 15:43:26 1st Qu.: 34.75
## Median :2017-07-03 03:39:51 Median : 68.50
## Mean :2017-09-06 11:31:32 Mean : 68.50
## 3rd Qu.:2017-11-26 14:32:41 3rd Qu.:102.25
## Max. :2018-05-03 02:11:42 Max. :136.00
##
# Show the journey of the first patient:
# slice() on an event log selects whole cases, not individual rows
slice(patients, 1)
## Event log consisting of:
## 12 events
## 1 traces
## 1 cases
## 6 activities
## 6 activity instances
##
## # A tibble: 12 x 7
## handling patient employee handling_id registration_ty~
## <fct> <chr> <fct> <chr> <fct>
## 1 Registr~ 43 r1 43 start
## 2 Triage ~ 43 r2 543 start
## 3 Blood t~ 43 r3 1020 start
## 4 MRI SCAN 43 r4 1257 start
## 5 Discuss~ 43 r6 1777 start
## 6 Check-o~ 43 r7 2272 start
## 7 Registr~ 43 r1 43 complete
## 8 Triage ~ 43 r2 543 complete
## 9 Blood t~ 43 r3 1020 complete
## 10 MRI SCAN 43 r4 1257 complete
## 11 Discuss~ 43 r6 1777 complete
## 12 Check-o~ 43 r7 2272 complete
## # ... with 2 more variables: time <dttm>, .order <int>
# How many distinct activities are there?
n_activities(patients)
## [1] 7
# What are the names of the activities?
activity_labels(patients)
## [1] Registration Triage and Assessment Blood test
## [4] MRI SCAN X-Ray Discuss Results
## [7] Check-out
## 7 Levels: Blood test Check-out Discuss Results MRI SCAN ... X-Ray
# Create a list of activities:
# activities() tabulates absolute and relative frequency per activity
activities(patients)
## # A tibble: 7 x 3
## handling absolute_frequency relative_frequency
## <fct> <int> <dbl>
## 1 Check-out 12 0.176
## 2 Discuss Results 12 0.176
## 3 Registration 12 0.176
## 4 Triage and Assessment 12 0.176
## 5 Blood test 8 0.118
## 6 MRI SCAN 8 0.118
## 7 X-Ray 4 0.0588
# Have a look at the different traces
# (a trace is a distinct activity sequence; only two occur in this log)
traces(patients)
## # A tibble: 2 x 3
## trace absolute_frequen~ relative_freque~
## <chr> <int> <dbl>
## 1 Registration,Triage and Assessment,B~ 8 0.667
## 2 Registration,Triage and Assessment,X~ 4 0.333
# How many are there?
n_traces(patients)
## [1] 2
# Visualize the traces using trace_explorer;
# coverage = 1 shows every trace, not just the most frequent ones
trace_explorer(patients, coverage=1)
# Draw process map
process_map(patients)
# A small insurance-claims event log: two cases (claim1, claim2), five events
claims <- tibble(id=c("claim1", "claim1", "claim2", "claim2", "claim2"),
action=c(10002L, 10011L, 10015L, 10024L, 10024L),
action_type=c("Check Contract", "Pay Back Decision", "Check Contract", "Pay Back Decision", "Pay Back Decision"),
date=as.Date(c("2008-01-12", "2008-03-22", "2008-01-13", "2008-03-23", "2008-04-14")),
originator=c("Assistant 1", "Manager 2", "Assistant 6", "Manager 2", "Manager 2"),
status=as.factor(c("start", "start", "start", "start", "complete"))
)
claims
## # A tibble: 5 x 6
## id action action_type date originator status
## <chr> <int> <chr> <date> <chr> <fct>
## 1 claim1 10002 Check Contract 2008-01-12 Assistant 1 start
## 2 claim1 10011 Pay Back Decision 2008-03-22 Manager 2 start
## 3 claim2 10015 Check Contract 2008-01-13 Assistant 6 start
## 4 claim2 10024 Pay Back Decision 2008-03-23 Manager 2 start
## 5 claim2 10024 Pay Back Decision 2008-04-14 Manager 2 complete
#create eventlog claims_log
claims_log <- eventlog(claims,
case_id = "id",
activity_id = "action_type",
activity_instance_id = "action",
lifecycle_id = "status",
timestamp = "date",
resource_id = "originator")
# Print summary
summary(claims_log)
## Number of events: 5
## Number of cases: 2
## Number of traces: 1
## Number of distinct activities: 2
## Average trace length: 2.5
##
## Start eventlog: 2008-01-12
## End eventlog: 2008-04-14
## id action action_type
## Length:5 Length:5 Check Contract :2
## Class :character Class :character Pay Back Decision:3
## Mode :character Mode :character
##
##
##
## date originator status .order
## Min. :2008-01-12 Assistant 1:1 complete:1 Min. :1
## 1st Qu.:2008-01-13 Assistant 6:1 start :4 1st Qu.:2
## Median :2008-03-22 Manager 2 :3 Median :3
## Mean :2008-02-28 Mean :3
## 3rd Qu.:2008-03-23 3rd Qu.:4
## Max. :2008-04-14 Max. :5
# Check activity labels
activity_labels(claims_log)
## [1] Check Contract Pay Back Decision
## Levels: Check Contract Pay Back Decision
# Once you have an eventlog, you can access its complete metadata using the function mapping or the functions case_id, activity_id etc., to inspect individual identifiers.
Chapter 2 - Analysis Techniques
Organizational analysis:
Structuredness:
Performance analysis:
Linking perspectives:
Example code includes:
data(sepsis, package="eventdataR")
str(sepsis)
## Classes 'eventlog', 'tbl_df', 'tbl' and 'data.frame': 15214 obs. of 34 variables:
## $ case_id : chr "A" "A" "A" "A" ...
## $ activity : Factor w/ 16 levels "Admission IC",..: 4 10 3 9 6 5 8 7 2 3 ...
## $ lifecycle : Factor w/ 1 level "complete": 1 1 1 1 1 1 1 1 1 1 ...
## $ resource : Factor w/ 26 levels "?","A","B","C",..: 2 3 3 3 4 2 2 2 5 3 ...
## $ timestamp : POSIXct, format: "2014-10-22 11:15:41" "2014-10-22 11:27:00" ...
## $ age : int 85 NA NA NA NA NA NA NA NA NA ...
## $ crp : num NA NA 210 NA NA NA NA NA NA 1090 ...
## $ diagnose : chr "A" NA NA NA ...
## $ diagnosticartastrup : chr "true" NA NA NA ...
## $ diagnosticblood : chr "true" NA NA NA ...
## $ diagnosticecg : chr "true" NA NA NA ...
## $ diagnosticic : chr "true" NA NA NA ...
## $ diagnosticlacticacid : chr "true" NA NA NA ...
## $ diagnosticliquor : chr "false" NA NA NA ...
## $ diagnosticother : chr "false" NA NA NA ...
## $ diagnosticsputum : chr "false" NA NA NA ...
## $ diagnosticurinaryculture : chr "true" NA NA NA ...
## $ diagnosticurinarysediment: chr "true" NA NA NA ...
## $ diagnosticxthorax : chr "true" NA NA NA ...
## $ disfuncorg : chr "true" NA NA NA ...
## $ hypotensie : chr "true" NA NA NA ...
## $ hypoxie : chr "false" NA NA NA ...
## $ infectionsuspected : chr "true" NA NA NA ...
## $ infusion : chr "true" NA NA NA ...
## $ lacticacid : chr NA NA NA "2.2" ...
## $ leucocytes : chr NA "9.6" NA NA ...
## $ oligurie : chr "false" NA NA NA ...
## $ sirscritheartrate : chr "true" NA NA NA ...
## $ sirscritleucos : chr "false" NA NA NA ...
## $ sirscrittachypnea : chr "true" NA NA NA ...
## $ sirscrittemperature : chr "true" NA NA NA ...
## $ sirscriteria2ormore : chr "true" NA NA NA ...
## $ activity_instance_id : chr "1" "2" "3" "4" ...
## $ .order : int 1 2 3 4 5 6 7 8 9 10 ...
## - attr(*, "case_id")= chr "case_id"
## - attr(*, "activity_id")= chr "activity"
## - attr(*, "activity_instance_id")= chr "activity_instance_id"
## - attr(*, "lifecycle_id")= chr "lifecycle"
## - attr(*, "resource_id")= chr "resource"
## - attr(*, "timestamp")= chr "timestamp"
# Print list of resources
resource_frequency(sepsis, level="resource")
## # A tibble: 26 x 3
## resource absolute relative
## <fct> <int> <dbl>
## 1 B 8111 0.533
## 2 A 3462 0.228
## 3 C 1053 0.0692
## 4 E 782 0.0514
## 5 ? 294 0.0193
## 6 F 216 0.0142
## 7 L 213 0.0140
## 8 O 186 0.0122
## 9 G 148 0.00973
## 10 I 126 0.00828
## # ... with 16 more rows
# Number of resources per activity
resource_frequency(sepsis, level = "activity")
## # A tibble: 16 x 11
## activity nr_of_resources min q1 mean median q3 max st_dev
## <fct> <int> <int> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
## 1 Admissi~ 4 1 7 29.2 31 53.2 54 28.2
## 2 Admissi~ 20 1 17 59.1 40.5 68.2 216 62.7
## 3 CRP 1 3262 3262 3262 3262 3262 3262 NA
## 4 ER Regi~ 2 65 295 525 525 755 985 651.
## 5 ER Seps~ 2 65 295. 524. 524. 754. 984 650.
## 6 ER Tria~ 1 1053 1053 1053 1053 1053 1053 NA
## 7 IV Anti~ 2 45 228. 412. 412. 595. 778 518.
## 8 IV Liqu~ 2 38 207. 376. 376. 546. 715 479.
## 9 LacticA~ 1 1466 1466 1466 1466 1466 1466 NA
## 10 Leucocy~ 1 3383 3383 3383 3383 3383 3383 NA
## 11 Release~ 1 671 671 671 671 671 671 NA
## 12 Release~ 1 56 56 56 56 56 56 NA
## 13 Release~ 1 25 25 25 25 25 25 NA
## 14 Release~ 1 24 24 24 24 24 24 NA
## 15 Release~ 1 6 6 6 6 6 6 NA
## 16 Return ~ 1 294 294 294 294 294 294 NA
## # ... with 2 more variables: iqr <dbl>, total <int>
# Plot Number of executions per resource-activity
resource_frequency(sepsis, level = "resource-activity") %>% plot()
# Calculate resource involvement
resource_involvement(sepsis, level="resource")
## # A tibble: 26 x 3
## resource absolute relative
## <fct> <int> <dbl>
## 1 C 1050 1
## 2 B 1013 0.965
## 3 A 985 0.938
## 4 E 782 0.745
## 5 ? 294 0.28
## 6 F 200 0.190
## 7 O 179 0.170
## 8 G 147 0.14
## 9 I 118 0.112
## 10 M 82 0.0781
## # ... with 16 more rows
# Show graphically
sepsis %>% resource_involvement(level = "resource") %>% plot
# Compare with resource frequency
resource_frequency(sepsis, level="resource")
## # A tibble: 26 x 3
## resource absolute relative
## <fct> <int> <dbl>
## 1 B 8111 0.533
## 2 A 3462 0.228
## 3 C 1053 0.0692
## 4 E 782 0.0514
## 5 ? 294 0.0193
## 6 F 216 0.0142
## 7 L 213 0.0140
## 8 O 186 0.0122
## 9 G 148 0.00973
## 10 I 126 0.00828
## # ... with 16 more rows
# Min, max and average number of repetitions
sepsis %>% number_of_repetitions(level = "log")
## Using default type: all
## min q1 median mean q3 max st_dev iqr
## 0.000000 0.000000 2.000000 1.640000 3.000000 5.000000 1.280461 3.000000
## attr(,"type")
## [1] "all"
# Plot repetitions per activity
sepsis %>% number_of_repetitions(level = "activity") %>% plot
## Using default type: all
# Number of repetitions per resources
sepsis %>% number_of_repetitions(level = "resource")
## Using default type: all
## # resource_metric [26 x 3]
## first_resource absolute relative
## <fct> <dbl> <dbl>
## 1 ? 0 0
## 2 A 0 0
## 3 B 1536 0.189
## 4 C 3 0.00285
## 5 D 0 0
## 6 E 0 0
## 7 F 16 0.0741
## 8 G 67 0.453
## 9 H 6 0.109
## 10 I 12 0.0952
## # ... with 16 more rows
eci <- c('21', '21', '21', '21', '21', '21', '21', '21', '21', '31', '31', '31', '31', '31', '31', '31', '31', '31', '31', '41', '41', '41', '41', '41', '41', '41', '51', '51', '51', '51', '51', '51', '51', '61', '61', '61', '61', '61', '61', '91', '91', '91', '91', '91', '91', '101', '101', '101', '101', '101', '101', '111', '111', '111', '111', '121', '121', '121', '121', '121', '121', '121', '121', '121', '131', '131', '131', '131', '131', '131', '131', '131', '161', '161', '171', '171', '171', '171', '181', '181', '181', '181', '181', '181', '201', '201', '201', '201', '201', '201', '201', '12', '12', '12', '12', '12', '22', '22', '22', '22', '22', '22', '32', '32', '32', '32', '32', '32', '42', '42', '42', '42', '52', '52', '52', '52', '52', '82', '82', '82', '82', '82', '92', '92', '92', '92', '92', '102', '102', '102', '102', '102', '112', '112', '122', '122', '21', '21', '21', '21', '21', '21', '21', '21', '21', '31', '31', '31', '31', '31', '31', '31', '31', '31', '31', '41', '41', '41', '41', '41', '41', '41', '51', '51', '51', '51', '51', '51', '51', '61', '61', '61', '61', '61', '61', '91', '91', '91', '91', '91', '91', '101', '101', '101', '101', '101', '101', '111', '111', '111', '111', '121', '121', '121', '121', '121', '121', '121', '121', '121', '131', '131', '131', '131', '131', '131', '131', '131', '161', '161', '171', '171', '171', '171', '181', '181', '181', '181', '181', '181', '201', '201', '201', '201', '201', '201', '201', '12', '12', '12', '12', '12', '22', '22', '22', '22', '22', '22', '32', '32', '32', '32', '32', '32', '42', '42', '42', '42', '52', '52', '52', '52', '52', '82', '82', '82', '82', '82', '92', '92', '92', '92', '92', '102', '102', '102', '102', '102', '112', '112', '122', '122')
ea1 <- c('prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'snack', 'prepareDinner', 'eatingDinner', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'snack', 'prepareDinner', 'eatingDinner', 'prepareBreakfast', 'eatingBreakfast', 'prepareBreakfast', 'eatingBreakfast', 'prepareDinner', 'eatingDinner', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'prepareDinner', 'eatingDinner', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareDinner', 'eatingDinner', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'snack', 'prepareLunch', 'eatingLunch', 'prepareBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'snack', 'snack', 'eatingBreakfast', 'prepareBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'snack', 'eatingBreakfast', 'prepareBreakfast', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'snack', 'snack', 'eatingDinner', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'prepareDinner', 'eatingDinner', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'eatingLunch', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 
'prepareBreakfast', 'eatingBreakfast', 'prepareBreakfast')
ea2 <- c('eatingBreakfast', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'snack', 'prepareDinner', 'eatingDinner', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'snack', 'prepareDinner', 'eatingDinner', 'prepareBreakfast', 'eatingBreakfast', 'prepareBreakfast', 'eatingBreakfast', 'prepareDinner', 'eatingDinner', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'prepareDinner', 'eatingDinner', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareDinner', 'eatingDinner', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'snack', 'prepareLunch', 'eatingLunch', 'prepareBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'snack', 'snack', 'eatingBreakfast', 'prepareBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'snack', 'eatingBreakfast', 'prepareBreakfast', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'snack', 'snack', 'eatingDinner', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'prepareDinner', 'eatingDinner', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'eatingLunch', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 
'snack', 'prepareBreakfast', 'eatingBreakfast', 'prepareBreakfast', 'eatingBreakfast')
eaii <- c('9', '10', '19', '23', '24', '26', '36', '40', '41', '51', '52', '58', '60', '62', '63', '67', '69', '72', '73', '86', '87', '89', '90', '104', '105', '107', '119', '120', '128', '132', '133', '138', '139', '149', '150', '156', '159', '160', '164', '174', '175', '192', '194', '195', '198', '205', '206', '208', '211', '213', '214', '229', '236', '237', '239', '245', '251', '252', '253', '255', '259', '260', '262', '264', '271', '276', '281', '287', '292', '293', '297', '299', '310', '312', '331', '332', '336', '347', '363', '364', '374', '376', '387', '389', '434', '435', '447', '448', '450', '453', '454', '462', '463', '471', '472', '475', '483', '484', '487', '491', '492', '496', '508', '509', '512', '517', '518', '522', '536', '540', '541', '543', '562', '563', '565', '566', '572', '584', '585', '589', '590', '598', '615', '616', '618', '619', '627', '639', '640', '642', '643', '653', '665', '666', '682', '683', '9', '10', '19', '23', '24', '26', '36', '40', '41', '51', '52', '58', '60', '62', '63', '67', '69', '72', '73', '86', '87', '89', '90', '104', '105', '107', '119', '120', '128', '132', '133', '138', '139', '149', '150', '156', '159', '160', '164', '174', '175', '192', '194', '195', '198', '205', '206', '208', '211', '213', '214', '229', '236', '237', '239', '245', '251', '252', '253', '255', '259', '260', '262', '264', '271', '276', '281', '287', '292', '293', '297', '299', '310', '312', '331', '332', '336', '347', '363', '364', '374', '376', '387', '389', '434', '435', '447', '448', '450', '453', '454', '462', '463', '471', '472', '475', '483', '484', '487', '491', '492', '496', '508', '509', '512', '517', '518', '522', '536', '540', '541', '543', '562', '563', '565', '566', '572', '584', '585', '589', '590', '598', '615', '616', '618', '619', '627', '639', '640', '642', '643', '653', '665', '666', '682', '683')
elci <- c('start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 
'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete')
ets1 <- c('2012-11-12 09:42:02', '2012-11-12 09:52:33', '2012-11-12 11:05:44', '2012-11-12 13:45:49', '2012-11-12 13:48:49', '2012-11-12 15:23:00', '2012-11-12 18:47:29', '2012-11-12 22:35:21', '2012-11-12 22:35:21', '2012-11-13 08:56:37', '2012-11-13 09:04:54', '2012-11-13 10:14:04', '2012-11-13 13:47:45', '2012-11-13 14:08:24', '2012-11-13 14:19:01', '2012-11-13 17:34:23', '2012-11-13 18:51:51', '2012-11-13 23:05:07', '2012-11-13 23:17:07', '2012-11-14 09:06:08', '2012-11-14 09:17:48', '2012-11-14 10:38:16', '2012-11-14 10:44:16', '2012-11-14 21:30:09', '2012-11-14 21:37:09', '2012-11-14 22:14:23', '2012-11-15 09:37:15', '2012-11-15 09:47:12', '2012-11-15 10:11:08', '2012-11-15 14:35:27', '2012-11-15 14:41:27', '2012-11-15 22:07:26', '2012-11-15 22:26:02', '2012-11-16 10:39:14', '2012-11-16 10:52:56', '2012-11-16 12:09:10', '2012-11-16 14:13:00', '2012-11-16 14:19:00', '2012-11-16 18:11:36', '2012-11-19 10:13:23', '2012-11-19 10:25:00', '2012-11-19 15:55:22', '2012-11-19 21:47:27', '2012-11-19 21:59:27', '2012-11-19 22:31:06', '2012-11-20 10:20:00', '2012-11-20 10:21:02', '2012-11-20 11:00:16', '2012-11-20 13:03:28', '2012-11-20 14:25:11', '2012-11-20 14:41:22', '2012-11-21 10:01:00', '2012-11-21 15:02:08', '2012-11-21 15:15:08', '2012-11-21 17:50:29', '2012-11-22 01:40:42', '2012-11-22 10:19:15', '2012-11-22 10:26:15', '2012-11-22 11:02:27', '2012-11-22 11:56:06', '2012-11-22 15:05:51', '2012-11-22 15:12:55', '2012-11-22 16:43:08', '2012-11-22 18:15:32', '2012-11-23 00:36:00', '2012-11-23 01:03:00', '2012-11-23 09:49:00', '2012-11-23 12:53:06', '2012-11-23 14:01:08', '2012-11-23 14:23:08', '2012-11-23 16:57:24', '2012-11-23 17:58:00', '2012-11-26 09:06:12', '2012-11-26 09:57:12', '2012-11-27 10:20:26', '2012-11-27 10:30:50')
ets2 <- c('2012-11-27 11:54:15', '2012-11-27 19:46:15', '2012-11-28 09:27:15', '2012-11-28 09:34:15', '2012-11-28 12:28:02', '2012-11-28 13:16:33', '2012-11-28 19:30:08', '2012-11-28 22:15:02', '2012-11-30 10:43:19', '2012-11-30 10:46:19', '2012-11-30 14:51:36', '2012-11-30 15:08:36', '2012-11-30 17:30:40', '2012-11-30 22:12:05', '2012-11-30 22:16:07', '2011-11-28 10:38:00', '2011-11-28 10:43:00', '2011-11-28 14:31:06', '2011-11-28 14:42:00', '2011-11-28 20:20:55', '2011-11-29 12:09:09', '2011-11-29 12:11:01', '2011-11-29 13:25:29', '2011-11-29 15:15:14', '2011-11-29 15:23:00', '2011-11-29 16:32:20', '2011-11-30 10:23:46', '2011-11-30 10:28:46', '2011-11-30 13:05:27', '2011-11-30 14:39:42', '2011-11-30 14:56:00', '2011-11-30 16:41:05', '2011-11-30 14:37:00', '2011-12-01 11:17:05', '2011-12-01 11:20:05', '2011-12-01 14:29:37', '2011-12-02 12:29:08', '2011-12-02 12:32:08', '2011-12-02 14:47:18', '2011-12-02 14:51:00', '2011-12-02 19:40:44', '2011-12-05 12:15:45', '2011-12-05 12:18:05', '2011-12-05 15:00:55', '2011-12-05 15:14:00', '2011-12-05 19:24:11', '2011-12-06 11:30:19', '2011-12-06 11:33:02', '2011-12-06 14:41:16', '2011-12-06 14:56:00', '2011-12-06 19:22:50', '2011-12-07 11:12:17', '2011-12-07 11:17:22', '2011-12-07 14:04:32', '2011-12-07 14:14:00', '2011-12-07 19:23:55', '2011-12-08 11:25:12', '2011-12-08 11:29:01', '2011-12-09 11:00:13', '2011-12-09 11:03:33', '2012-11-12 09:50:02', '2012-11-12 09:55:29', '2012-11-12 12:39:42', '2012-11-12 14:48:14', '2012-11-12 14:53:14', '2012-11-12 15:31:53', '2012-11-12 19:00:56', '2012-11-12 22:37:55', '2012-11-12 22:40:55', '2012-11-13 09:00:26', '2012-11-13 09:10:12', '2012-11-13 10:51:55', '2012-11-13 14:03:31', '2012-11-13 14:18:36', '2012-11-13 14:42:36', '2012-11-13 17:36:34', '2012-11-13 19:45:03', '2012-11-13 23:15:33', '2012-11-13 23:37:33', '2012-11-14 09:09:41', '2012-11-14 09:21:43', '2012-11-14 11:43:23', '2012-11-14 11:06:23', '2012-11-14 21:35:17', '2012-11-14 21:47:18', '2012-11-14 22:17:47', '2012-11-15 
09:44:06', '2012-11-15 09:48:08', '2012-11-15 10:23:49', '2012-11-15 15:40:32', '2012-11-15 15:46:32', '2012-11-15 22:22:44', '2012-11-15 22:31:00', '2012-11-16 10:42:13')
ets3 <- c('2012-11-16 10:52:58', '2012-11-16 12:09:57', '2012-11-16 14:58:55', '2012-11-16 14:55:55', '2012-11-16 18:14:49', '2012-11-19 10:17:12', '2012-11-19 10:33:59', '2012-11-19 16:07:49', '2012-11-19 21:59:01', '2012-11-19 22:24:58', '2012-11-19 22:31:59', '2012-11-20 10:21:02', '2012-11-20 10:37:51', '2012-11-20 11:14:44', '2012-11-20 13:28:35', '2012-11-20 14:40:16', '2012-11-20 15:10:16', '2012-11-21 10:06:50', '2012-11-21 15:14:47', '2012-11-21 15:30:55', '2012-11-21 17:55:48', '2012-11-22 01:45:42', '2012-11-22 10:25:45', '2012-11-22 10:59:45', '2012-11-22 11:10:30', '2012-11-22 12:09:07', '2012-11-22 15:12:19', '2012-11-22 15:26:18', '2012-11-22 16:51:54', '2012-11-22 18:17:25', '2012-11-23 00:41:13', '2012-11-23 10:28:57', '2012-11-23 10:01:57', '2012-11-23 12:57:33', '2012-11-23 14:20:47', '2012-11-23 14:38:47', '2012-11-23 16:57:43', '2012-11-23 18:06:38', '2012-11-26 10:37:28', '2012-11-26 10:05:28', '2012-11-27 10:30:43', '2012-11-27 10:44:43', '2012-11-27 11:54:59', '2012-11-27 19:46:56', '2012-11-28 09:33:52', '2012-11-28 09:44:52', '2012-11-28 12:57:42', '2012-11-28 13:38:45', '2012-11-28 19:45:20', '2012-11-28 22:18:43', '2012-11-30 11:45:40', '2012-11-30 11:51:40', '2012-11-30 15:05:54', '2012-11-30 15:20:00', '2012-11-30 17:42:59', '2012-11-30 22:15:48', '2012-11-30 22:39:48', '2011-11-28 10:42:55', '2011-11-28 10:49:00', '2011-11-28 14:41:54', '2011-11-28 15:04:00', '2011-11-28 20:20:59', '2011-11-29 12:10:37', '2011-11-29 12:19:00', '2011-11-29 13:25:32', '2011-11-29 15:22:57', '2011-11-29 15:49:00', '2011-11-29 16:32:23', '2011-11-30 10:27:58', '2011-11-30 10:38:58', '2011-11-30 13:05:31', '2011-11-30 14:55:24', '2011-11-30 15:11:00', '2011-11-30 16:41:09', '2011-11-30 15:08:00', '2011-12-01 11:19:43', '2011-12-01 11:29:43', '2011-12-01 14:36:38', '2011-12-02 12:31:10', '2011-12-02 12:37:10', '2011-12-02 14:50:19', '2011-12-02 15:24:00', '2011-12-02 19:40:50', '2011-12-05 12:17:58', '2011-12-05 12:26:02', '2011-12-05 15:13:55', '2011-12-05 
15:42:00', '2011-12-05 19:24:16', '2011-12-06 11:32:49', '2011-12-06 11:38:51', '2011-12-06 14:55:18', '2011-12-06 15:18:18', '2011-12-06 19:22:55', '2011-12-07 11:17:14', '2011-12-07 11:22:35', '2011-12-07 14:13:34', '2011-12-07 14:41:00', '2011-12-07 20:38:18', '2011-12-08 11:28:24', '2011-12-08 11:35:55', '2011-12-09 11:03:09', '2011-12-09 11:09:08')
# Combine the three timestamp chunks into one vector (order matters: it must
# line up element-wise with the other event attribute vectors).
etsF <- c(ets1, ets2, ets3)
# Assemble the eating-pattern event data; every event gets the placeholder
# resource "UNDEFINED" because the source data carries no resource info.
eatData <- tibble(case_id=eci,
activity=factor(c(ea1, ea2)),
activity_instance_id=eaii,
lifecycle_id=factor(elci),
resource=factor("UNDEFINED"),
# NOTE(review): as.POSIXct() without an explicit tz uses the session's local
# time zone, so the parsed instants are machine-dependent -- consider
# tz = "UTC"; left unchanged here to preserve the captured output below.
timestamp=as.POSIXct(etsF)
)
# Map the data-frame columns onto bupaR's eventlog roles.
eat_patterns <- eventlog(eatData,
case_id = "case_id",
activity_id = "activity",
activity_instance_id = "activity_instance_id",
lifecycle_id = "lifecycle_id",
timestamp = "timestamp",
resource_id = "resource")
# Create performance map (median time per flow, reported in hours)
eat_patterns %>% process_map(type = performance(FUN = median, units = "hours"))
# Inspect variation in activity durations graphically
eat_patterns %>% processing_time(level = "activity") %>% plot()
# Draw dotted chart (one dot per event; x-axis relative to each case's start)
eat_patterns %>% dotted_chart(x = "relative_day", sort = "start_day", units = "secs")
# The commented calls below are kept for reference only: 'daily_activities'
# and 'vacancies' are DataCamp course datasets that are not loaded in this
# document, so the calls cannot be run here.
# Time per activity
# daily_activities %>% processing_time(level = "activity") %>% plot
# Average duration of recordings
# daily_activities %>% throughput_time(level="log", units = "hours")
# Missing activities
# daily_activities %>% idle_time(level="log", units = "hours")
# Distribution throughput time
# vacancies %>% throughput_time(units="days")
# Distribution throughput time per department
# vacancies %>% group_by(vacancy_department) %>% throughput_time(units="days") %>% plot()
# Repetitions of activities
# vacancies %>% number_of_repetitions(level = "activity") %>% arrange(-relative)
Chapter 3 - Event Data Processing
Filtering cases:
Filtering events - trim, frequency, label, general attribute:
Aggregating events - Is-A and Part-of:
Enriching events - mutation (adding calculated variables):
Example code includes:
# Select top 20% of cases according to trace frequency
happy_path <- filter_trace_frequency(vacancies, percentage = 0.2)
# Visualize using process map.
# FIX(review): original read `type = requency(...)` -- a typo that errors at
# runtime; the processmapR map-profile constructor is frequency().
happy_path %>% process_map(type = frequency(value = "absolute_case"))
# Compute throughput time
happy_path %>% throughput_time(units = "days")
# Find no_declines: cases that never contain "Decline Candidate"
# (reverse = TRUE inverts the presence filter)
no_declines <- filter_activity_presence(vacancies, activities = "Decline Candidate", reverse=TRUE)
# Keep cases that contain BOTH "Send Offer" and "Offer Accepted"
# (method = "all"). NOTE(review): the original comment here was truncated
# mid-sentence ("What is the average number of").
first_hit <- filter_activity_presence(vacancies, activities = c("Send Offer", "Offer Accepted"), method="all")
# Create a performance map
first_hit %>% process_map(type=performance())
# Compute throughput time (no units given -- uses the function's default)
first_hit %>% throughput_time()
# Create not_refused: keep cases in which "Receive Response" is NOT directly
# followed by "Review Non Acceptance" (filter_method = "none")
not_refused <- vacancies %>% filter_precedence(antecedents = "Receive Response", consequents = "Review Non Acceptance", precedence_type = "directly_follows", filter_method = "none")
# Select longest_cases.
# NOTE(review): interval = c(300, NA) means throughput time of at least 300
# with no upper bound; the unit is the function's default -- confirm intended.
worst_cases <- not_refused %>% filter_throughput_time(interval=c(300, NA))
# Show the different traces (coverage = 1 shows all of them)
worst_cases %>% trace_explorer(coverage=1)
# Select activities: keep only the offer/disapproval-related events
disapprovals <- vacancies %>% filter_activity(activities=c("Construct Offer", "Disapprove Offer", "Revise Offer","Disapprove Revision", "Restart Procedure"))
# Explore traces covering 80% of the filtered cases
disapprovals %>% trace_explorer(coverage=0.8)
# Performance map (total time per flow, in weeks)
disapprovals %>% process_map(type = performance(FUN = sum, units = "weeks"))
# Select cases: R&D vacancies in the top salary bracket
high_paid <- vacancies %>% filter(vacancy_department=="R&D", vacancy_salary_range==">100000")
# Most active resources
high_paid %>% resource_frequency(level="resource")
# Create a dotted chart (absolute time axis, cases sorted by start time)
high_paid %>% dotted_chart(x="absolute", sort="start")
# Filtered dotted chart; lubridate::ymd parses the interval endpoints.
library(lubridate)
# filter_method = "trim" cuts events outside the window instead of dropping
# whole cases.
high_paid %>% filter_time_period(interval = ymd(c("20180321","20180620")), filter_method = "trim") %>% dotted_chart(x="absolute", sort="start")
# Count activities and instances (baseline, before any aggregation)
n_activities(vacancies)
n_activity_instances(vacancies)
# Combine activities: act_unite() relabels several activity names as one
# ("Is-A" aggregation, per the section notes above).
united_vacancies <- vacancies %>%
act_unite("Disapprove Contract Offer" = c("Disapprove Offer","Disapprove Revision"),
"Approve Contract Offer" = c("Approve Offer","Approve Revision"),
"Construct Contract Offer" = c("Construct Offer","Revise Offer")
)
# Count activities and instances again to see the effect of act_unite()
n_activities(united_vacancies)
n_activity_instances(united_vacancies)
# Aggregate sub processes: act_collapse() groups activities into a named
# sub-process ("Part-of" aggregation, per the section notes above).
aggregated_vacancies <- act_collapse(united_vacancies,
"Interviews" = c("First Interview","Second Interview","Third Interview"),
"Prepare Recruitment" = c("Publish Position","File Applications","Check References"),
"Create Offer" = c("Construct Contract Offer", "Disapprove Contract Offer", "Approve Contract Offer")
)
# Calculated number of activities and activity instances after collapsing
n_activities(aggregated_vacancies)
n_activity_instances(aggregated_vacancies)
# Create performance map of the aggregated process
aggregated_vacancies %>% process_map(type=performance())
# Add total_cost: the summed activity cost of each case, attached to every
# event of that case (group_by_case() groups the eventlog by case id).
vacancies_cost <- vacancies %>%
group_by_case() %>%
mutate(total_cost = sum(activity_cost, na.rm = TRUE))
# Add cost_impact.
# FIX(review): the original mutate() defining cost_impact was lost when this
# document was split, leaving a dangling `%>%` that swallowed the next
# statement (a syntax/semantics error). Reconstructed as total_cost tertiles;
# the labels must match the "High"/"Medium"/"Low" values tested by the
# cost_profile case_when() further below -- confirm the original thresholds
# against the course material.
vacancies_impact <- vacancies_cost %>%
mutate(cost_impact = factor(ntile(total_cost, 3),
levels = 1:3,
labels = c("Low", "Medium", "High")))
# Compute throughput time per impact
vacancies_impact %>% group_by(cost_impact) %>% throughput_time(units = "weeks") %>% plot()
# Create cost_profile: label each case by whether its cost level is in line
# with its urgency (case_when evaluates conditions top-down, first match wins).
vacancies_profile <- vacancies_impact %>%
mutate(cost_profile = case_when(cost_impact == "High" & urgency < 7 ~ "Disproportionate",
cost_impact == "Medium" & urgency < 5 ~ "Excessive",
cost_impact == "Low" & urgency > 6 ~ "Lacking",
# fallback: cost considered appropriate for the urgency
TRUE ~ "Appropriate"))
# Compare number of cases per profile
vacancies_profile %>%
group_by(cost_profile) %>%
n_cases()
# Explore lacking traces: process map restricted to "Lacking" cases
vacancies_profile %>%
filter(cost_profile == "Lacking") %>%
process_map()
Chapter 4 - Case Study
Preparing the event data - example includes data from Sales, Purchasing, Manufacturing, Packaging & Delivery, Accounting:
Getting to know the process:
Roles and rules:
Fast production, fast delivery:
Course recap:
Example code includes:
quotations <- readRDS("./RInputFiles/otc_quotations.RDS")
# Inspect quotations
str(quotations)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1833 obs. of 17 variables:
## $ quotation_id : chr "quo-1003" "quo-1004" "quo-1006" "quo-1008" ...
## $ cancelled_at : chr "2017-05-22 13:28:04" NA NA NA ...
## $ cancelled_by : Factor w/ 20 levels "Amy","Andrea",..: 10 NA NA NA 8 NA NA NA NA NA ...
## $ manufactContacted_at : chr "2017-04-22 17:58:11" "2017-06-18 13:47:50" "2017-10-28 13:55:51" NA ...
## $ manufactContacted_by : Factor w/ 20 levels "Amy","Andrea",..: 11 11 11 NA NA NA 11 14 NA NA ...
## $ received_at : chr "2017-04-16 20:34:12" "2017-06-09 11:19:31" "2017-10-14 18:55:47" "2017-09-08 13:29:05" ...
## $ received_by : Factor w/ 20 levels "Amy","Andrea",..: 2 8 8 8 8 8 10 8 2 2 ...
## $ reminded_at : chr "2017-05-14 19:06:41" NA NA NA ...
## $ reminded_by : Factor w/ 20 levels "Amy","Andrea",..: 8 NA NA NA 8 NA 8 8 NA NA ...
## $ send_at : chr "2017-05-08 14:20:30" "2017-07-02 18:50:58" "2017-11-09 11:27:11" NA ...
## $ send_by : Factor w/ 20 levels "Amy","Andrea",..: 10 2 2 NA 2 NA 2 2 NA 2 ...
## $ supplierContacted_at : chr "2017-04-29 13:43:18" "2017-06-20 12:19:31" "2017-10-26 18:06:29" NA ...
## $ supplierContacted_by : Factor w/ 20 levels "Amy","Andrea",..: 14 11 11 NA 11 NA 11 14 NA 14 ...
## $ supplierOfferReceived_at: chr "2017-05-03 19:09:21" "2017-06-23 19:33:10" "2017-10-30 10:36:44" NA ...
## $ supplierOfferReceived_by: Factor w/ 20 levels "Amy","Andrea",..: 14 11 14 NA 14 NA 14 14 NA 14 ...
## $ warehouseContacted_at : chr "2017-04-24 19:36:10" "2017-06-15 19:30:07" "2017-10-22 17:57:26" NA ...
## $ warehouseContacted_by : Factor w/ 20 levels "Amy","Andrea",..: 11 11 11 NA 14 NA 11 14 NA 14 ...
# Create offer_history: reshape quotations from wide (one row per quotation,
# one "<activity>_<info>" column per attribute) to long (one row per event
# attribute), then split the key into activity and info parts.
# NOTE(review): gather()/spread()/separate() are the tidyr 0.8-era verbs
# (matching the versions in the header); later tidyr supersedes gather/spread
# with pivot_longer()/pivot_wider().
offer_history <- quotations %>%
gather(key, value, -quotation_id) %>%
separate(key, into = c("activity", "info"))
## Warning: attributes are not identical across measure variables;
## they will be dropped
# Recode the key variable: "_at" columns hold timestamps, "_by" hold resources
offer_history <- offer_history %>%
mutate(info = fct_recode(info, "timestamp" = 'at', "resource" = 'by'))
# Spread the info variable back out into timestamp and resource columns
offer_history <- offer_history %>%
spread(info, value)
validations <- readRDS("./RInputFiles/otc_validations.RDS")
# Inspect validations
str(validations)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1833 obs. of 4 variables:
## $ quotation_id: chr "quo-1003" "quo-1004" "quo-1006" "quo-1008" ...
## $ resource : chr "Jonathan" "Andrea" "Katherine" "Andrea" ...
## $ started : chr "2017-04-17 14:59:08" "2017-06-11 13:10:45" "2017-10-16 15:59:18" "2017-09-09 17:58:39" ...
## $ completed : chr "2017-04-19 18:32:57" "2017-06-13 12:18:57" "2017-10-18 16:21:56" "2017-09-12 20:58:14" ...
# Create validate_history: each validation becomes a "Validate" activity with
# an action id of the form "<quotation_id>-validate".
validate_history <- validations %>%
mutate(
activity = "Validate",
action = paste(quotation_id, "validate", sep = "-"))
# Gather the timestamp columns: one row per lifecycle transition
# (started / completed)
validate_history <- validate_history %>%
gather(lifecycle, timestamp, started, completed)
# Recode the lifecycle column of validate_history to the standard
# "start"/"complete" lifecycle labels used elsewhere in this document
validate_history <- validate_history %>%
mutate(lifecycle = fct_recode(lifecycle,
"start" = "started",
"complete" = "completed"))
# Add lifecycle and action column to offer_history: offer events are atomic,
# so each is a single "complete" event with a sequence-numbered action id.
offer_history <- offer_history %>%
mutate(
lifecycle = "complete",
action = paste(quotation_id, 1:n(), sep = "-"))
# Create sales_history
sales_history <- bind_rows(validate_history, offer_history)
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
# Replace the hand-built sales_history with the course-provided version
# (which also carries sales_order_id), and load the order-fulfilment events.
sales_history <- readRDS("./RInputFiles/otc_sales_history.RDS")
order_history <- readRDS("./RInputFiles/otc_order_history.RDS")
# sales_quotations <- readRDS("./RInputFiles/otc_sales_quotation.RDS")
str(sales_history)
## Classes 'tbl_df', 'tbl' and 'data.frame': 14695 obs. of 7 variables:
## $ quotation_id : chr "quo-1003" "quo-1004" "quo-1006" "quo-1008" ...
## $ resource : chr "Jonathan" "Andrea" "Katherine" "Andrea" ...
## $ activity : chr "Validate" "Validate" "Validate" "Validate" ...
## $ action : chr "quo-1003-validate" "quo-1004-validate" "quo-1006-validate" "quo-1008-validate" ...
## $ lifecycle : chr "start" "start" "start" "start" ...
## $ timestamp : chr "2017-04-17 14:59:08" "2017-06-11 13:10:45" "2017-10-16 15:59:18" "2017-09-09 17:58:39" ...
## $ sales_order_id: chr NA "order-17-56548" "order-17-56550" NA ...
str(order_history)
## Classes 'tbl_df', 'tbl' and 'data.frame': 60804 obs. of 8 variables:
## $ sales_order_id: chr "order-17-56542" "order-17-56542" "order-17-56543" "order-17-56543" ...
## $ action : chr "order-17-56542-0000001" "order-17-56542-0000002" "order-17-56543-0000003" "order-17-56543-0000004" ...
## $ activity : Factor w/ 37 levels "Assemble Order",..: 24 35 24 35 24 35 24 35 24 35 ...
## $ resource : Factor w/ 20 levels "Amy","Andrea",..: 10 8 2 8 2 8 10 8 2 8 ...
## $ status : Factor w/ 2 levels "complete","start": 2 2 2 2 2 2 2 2 2 2 ...
## $ time : POSIXct, format: "2017-10-17 12:37:22" "2017-10-19 15:30:40" ...
## $ activity_cost : num NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ...
## $ quotation_id : chr NA NA NA NA ...
# str(sales_quotations)
# Align order_history's column names and types with sales_history so the
# two event tables can be row-bound without coercion surprises.
order_history <- order_history %>%
  rename(timestamp = time, lifecycle = status) %>%
  select(-activity_cost) %>%
  mutate(
    activity = as.character(activity),
    # BUG FIX: original read `resource = as.character(activity)`, which
    # overwrote every resource with the activity name; convert the
    # resource factor itself instead.
    resource = as.character(resource),
    lifecycle = as.character(lifecycle)
  )
# Parse character timestamps to POSIXct so both tables share a time type,
# combine sales and order events, and declare the bupaR eventlog mapping.
sales_history <- sales_history %>%
mutate(timestamp=lubridate::as_datetime(timestamp))
# sales_history <- sales_history %>% left_join(sales_quotations)
otc <- bind_rows(sales_history, order_history)
# Create the eventlog object
# (case id combines quotation and sales-order ids since either may be NA)
otc <- otc %>%
mutate(case_id = paste(quotation_id, sales_order_id, sep = "-")) %>%
eventlog(
case_id = "case_id",
activity_id = "activity",
activity_instance_id = "action",
timestamp = "timestamp",
resource_id = "resource",
lifecycle_id = "lifecycle"
)
# Explore the order-to-cash event log: trace coverage, process maps after
# collapsing delivery-related activities, processing times, and filters.
# Create trace coverage graph
trace_coverage(otc, level="trace") %>% plot()
# Explore traces
otc %>%
trace_explorer(coverage = 0.25)
# Collapse activities
# (four delivery variants become one "Delivery" activity to simplify the map)
otc_high_level <- act_collapse(otc, "Delivery" = c(
"Handover To Deliverer",
"Order Delivered",
"Present For Collection",
"Order Fetched")
)
# Draw a process map
process_map(otc_high_level)
# Redraw the trace coverage graph
otc_high_level %>% trace_coverage(level="trace") %>% plot()
# Compute activity wise processing time
otc_high_level %>% processing_time(level="activity", units="days")
## # A tibble: 34 x 11
## activity min q1 mean median q3 max st_dev iqr total
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Packaging 0 0 0 0 0 0 0 0 0
## 2 Prepare Invoice 0 0 0 0 0 0 0 0 0
## 3 Produce Order 0 0 0 0 0 0 0 0 0
## 4 Quality Control 0 0 0 0 0 0 0 0 0
## 5 Assemble Order 0 0 0 0 0 0 0 0 0
## 6 Delivery 0.583 1.99 5.11 3.11 8.06 17.0 3.86 6.07 15452
## 7 Order Materials 0 0 0 0 0 0 0 0 0
## 8 Receive Materi~ 0 0 0 0 0 0 0 0 0
## 9 Receive Sales ~ 0 0 0 0 0 0 0 0 0
## 10 Schedule Job 0 0 0 0 0 0 0 0 0
## # ... with 24 more rows, and 1 more variable: relative_frequency <dbl>
# Plot a resource activity matrix of otc
otc %>% resource_frequency(level = "resource-activity") %>% plot()
# Create otc_selection
otc_selection <- otc %>% filter_activity(activities = c("Send Quotation","Send Invoice"))
# Explore traces
otc %>% trace_explorer(coverage=1)
# Draw a resource map
otc_selection %>% resource_map()
# Create otc_returned
otc_returned <- otc %>% filter_activity_presence("Return Goods")
# Compute percentage of returned orders
n_cases(otc_returned)/n_cases(otc)
## [1] 0.2130923
# Trim cases and visualize
otc_returned %>% filter_trim(start_activities="Return Goods") %>% process_map()
# Time from order to delivery
# otc %>% filter_trim(start_activities="Receive Sales Order", end_activities="Order Delivered") %>%
# processing_time(units="days")
# Plot processing time by type
# otc %>%
# group_by(type) %>%
# throughput_time() %>%
# plot()
Chapter 1 - Hubs of the Network
Network science - include social networks, neural networks, etc.:
Visualizing networks:
Centrality measures:
Example code includes:
# Build the Madrid terrorist network from node and tie CSVs, inspect its
# attributes, visualize it under several layouts, and compute node degree.
# read the nodes file into the variable nodes
nodes <- readr::read_csv("./RInputFiles/nodes.csv")
nodes
# read the ties file into the variable ties
ties <- readr::read_csv("./RInputFiles/ties.csv")
ties
library(igraph)
library(ggraph)
# make the network from the data frame ties and print it
g <- graph_from_data_frame(ties, directed = FALSE, vertices = nodes)
g
# explore the set of nodes
V(g)
# print the number of nodes
vcount(g)
# explore the set of ties
E(g)
# print the number of ties
ecount(g)
# give the name "Madrid network" to the network and print the network `name` attribute
g$name <- "Madrid network"
g$name
# add node attribute id and print the node `id` attribute
V(g)$id <- 1:vcount(g)
V(g)$id
# print the tie `weight` attribute
E(g)$weight
# print the network and spot the attributes
g
# visualize the network with layout Kamada-Kawai
ggraph(g, layout = "with_kk") +
geom_edge_link(aes(alpha = weight)) +
geom_node_point()
# add an id label to nodes
ggraph(g, layout = "with_kk") +
geom_edge_link(aes(alpha = weight)) +
geom_node_point() +
geom_node_text(aes(label = id), repel=TRUE)
# visualize the network with circular layout. Set tie transparency proportional to its weight
ggraph(g, layout = "in_circle") +
geom_edge_link(aes(alpha = weight)) +
geom_node_point()
# visualize the network with grid layout. Set tie transparency proportional to its weight
ggraph(g, layout = "grid") +
geom_edge_link(aes(alpha = weight)) +
geom_node_point()
# compute the degrees of the nodes
dgr <- degree(g)
# add the degrees to the data frame object
nodes <- mutate(nodes, degree = dgr)
# add the degrees to the network object
V(g)$degree <- dgr
# arrange the terrorists in decreasing order of degree
arrange(nodes, -degree)
# compute node strengths
# (strength = sum of the weights of a node's incident ties, i.e. weighted degree)
stg <- strength(g)
# add strength to the data frame object using mutate
nodes <- mutate(nodes, strength = stg)
# add the variable stg to the network object as strength
V(g)$strength <- stg
# arrange terrorists in decreasing order of strength and then in decreasing order of degree
# BUG FIX: the original ran two independent sorts (arrange(nodes, -degree)
# then arrange(nodes, -strength)); neither matches the stated intent of a
# primary strength sort with degree as tie-breaker.
arrange(nodes, -strength, -degree)
Chapter 2 - Weakness and strength
Tie betweenness:
Visualizing centrality measures:
The strength of weak ties:
Example code includes:
# Compute weighted tie betweenness (igraph treats weights as distances, so
# the inverse weight makes strong ties "short"), join human-readable names,
# and visualize degree/strength/betweenness on the network.
# save the inverse of tie weights as dist_weight
dist_weight <- 1 / E(g)$weight
# compute weighted tie betweenness
btw <- edge_betweenness(g, weights = dist_weight)
# mutate the data frame ties adding a variable betweenness using btw
ties <- mutate(ties, betweenness=btw)
# add the tie attribute betweenness to the network
E(g)$betweenness <- btw
# join ties with nodes
# (two joins: once for the "from" endpoint, once for the "to" endpoint)
ties_joined <- ties %>%
left_join(nodes, c("from" = "id")) %>%
left_join(nodes, c("to" = "id"))
# select only relevant variables and save to ties
ties_selected <- ties_joined %>%
select(from, to, name_from = name.x, name_to = name.y, betweenness)
# arrange named ties in decreasing order of betweenness
arrange(ties_selected, -betweenness)
# set (alpha) proportional to weight and node size proportional to degree
ggraph(g, layout = "with_kk") +
geom_edge_link(aes(alpha=weight)) +
geom_node_point(aes(size=degree))
# produce the same visualization but set node size proportional to strength
ggraph(g, layout = "with_kk") +
geom_edge_link(aes(alpha = weight)) +
geom_node_point(aes(size = strength))
# visualize the network with tie transparency proportional to betweenness
ggraph(g, layout = "with_kk") +
geom_edge_link(aes(alpha = betweenness)) +
geom_node_point()
# add node size proportional to degree
ggraph(g, layout = "with_kk") +
geom_edge_link(aes(alpha = betweenness)) +
geom_node_point(aes(size = degree))
# find median betweenness
# (idiom fix: use <- for assignment rather than =; behavior is unchanged)
q <- median(E(g)$betweenness)
# filter ties with betweenness larger than the median:
# the `filter` edge aesthetic draws only ties whose betweenness exceeds q
ggraph(g, layout = "with_kk") +
  geom_edge_link(aes(alpha = betweenness, filter = (betweenness > q))) +
  geom_node_point() +
  theme(legend.position = "none")
# find number and percentage of weak ties
ties %>%
  group_by(weight) %>%
  summarise(number = n(), percentage = n() / nrow(.)) %>%
  arrange(-number)
# build vector weakness containing TRUE for weak ties
# (a weight-1 tie is "weak"; the comparison already yields a logical
# vector, so the original ifelse(cond, TRUE, FALSE) wrapper was redundant)
weakness <- ties$weight == 1
# check that weakness contains the correct number of weak ties
sum(weakness)
# visualize the network by coloring the weak and strong ties
ggraph(g, layout = "with_kk") +
  geom_edge_link(aes(color = weakness)) +
  geom_node_point()
# visualize the network with only weak ties using the filter aesthetic
ggraph(g, layout = "with_kk") +
  geom_edge_link(aes(filter = weakness), alpha = 0.5) +
  geom_node_point()
Chapter 3 - Connection patterns
Connection patterns:
Pearson correlation coefficient:
Most similar and most dissimilar terrorists:
Example code includes:
# Symmetrize the tie list (add the reversed direction of every tie), then
# work with the weighted adjacency matrix: strengths are row sums of A,
# degrees are row sums of its 0/1 (Boolean) version.
# mutate ties data frame by swapping variables from and to
ties_mutated <- mutate(ties, temp = to, to = from, from = temp) %>% select(-temp)
# append ties_mutated data frame to ties data frame
ties <- rbind(ties, ties_mutated)
# use a scatter plot to visualize node connection patterns in ties setting color aesthetic to weight
ggplot(ties, aes(x = from, y = to, color = factor(weight))) +
geom_point() +
labs(color = "weight")
# get the weighted adjacency matrix
A <- as_adjacency_matrix(g, attr = "weight", sparse = FALSE, names = FALSE)
# print the first row and first column of A
A[1, ]
A[, 1]
# print submatrix of the first 6 rows and columns
A[1:6, 1:6]
# obtain a vector of node strengths
rowSums(A)
# build a Boolean (0/1) matrix from the weighted matrix A
B <- ifelse(A > 0, 1, 0)
# obtain a vector of node degrees using the Boolean matrix
rowSums(B)
# compute the Pearson correlation on columns of A
# (column i of the adjacency matrix is node i's connection profile, so
# S[i, j] measures how similarly nodes i and j connect to everyone else)
S <- cor(A)
# set the diagonal of S to 0 — self-similarity is uninformative
# (idiom fix: <- instead of = for the replacement-function assignment)
diag(S) <- 0
# print a summary of the similarities in matrix S
summary(c(S))
# plot a histogram of similarities in matrix S
hist(c(S), xlab = "Similarity", main = "Histogram of similarity")
# Relate degree to strength, convert the similarity matrix S into a
# similarity network/data frame, attach node names, and keep only highly
# similar pairs (>= 0.6) for visualization.
# Scatter plot of degree and strength with regression line
ggplot(nodes, aes(x = degree, y = strength)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
# Pearson correlation coefficient
cor(nodes$degree, nodes$strength)
# build weighted similarity network and save to h
h <- graph_from_adjacency_matrix(S, mode = "undirected", weighted = TRUE)
# convert the similarity network h into a similarity data frame sim_df
sim_df <- as_data_frame(h, what = "edges")
# map the similarity data frame to a tibble and save it as sim_tib
sim_tib <- as_tibble(sim_df)
# print sim_tib
sim_tib
# left join similarity and nodes data frames and then select and rename relevant variables
sim2 <- sim_tib %>%
left_join(nodes, c("from" = "id")) %>%
left_join(nodes, c("to" = "id")) %>%
select(from, to, name_from = name.x, name_to = name.y, similarity = weight,
degree_from = degree.x, degree_to = degree.y, strength_from = strength.x, strength_to = strength.y)
# print sim2
sim2
# arrange sim2 in decreasing order of similarity.
sim2 %>% arrange(-similarity)
# filter sim2, allowing only pairs with a degree of least 10, arrange the result in decreasing order of similarity
sim2 %>%
filter(degree_from >= 10, degree_to >= 10) %>%
arrange(-similarity)
# Repeat the previous steps, but in increasing order of similarity
sim2 %>%
filter(degree_from >= 10, degree_to >= 10) %>%
arrange(similarity)
# filter the similarity data frame to similarities larger than or equal to 0.60
sim3 <- filter(sim2, similarity >= 0.6)
# build a similarity network called h2 from the filtered similarity data frame
h2 <- graph_from_data_frame(sim3, directed = FALSE)
# visualize the similarity network h2
ggraph(h2, layout = "with_kk") +
geom_edge_link(aes(alpha = similarity)) +
geom_node_point()
Chapter 4 - Similarity Clusters
Hierarchical clustering - find clusters of similar people:
Interactive visualizations with visNetwork:
Wrap up:
Example code includes:
# Hierarchical clustering on dissimilarity (1 - similarity): cut the
# average-linkage dendrogram at 4 clusters, summarize them, and plot the
# network colored/faceted by cluster.
# compute a distance matrix
D <- 1 - S
# obtain a distance object
d <- as.dist(D)
# run average-linkage clustering method and plot the dendrogram
cc <- hclust(d, method = "average")
plot(cc)
# find the similarity of the first pair of nodes that have been merged
S[40, 45]
# cut the dendrogram at 4 clusters
cls <- cutree(cc, k = 4)
# add cluster information to the nodes data frame
nodes <- mutate(nodes, cluster = cls)
# print the nodes data frame
nodes
# output the names of terrorists in the first cluster
filter(nodes, cluster == 1) %>%
select(name)
# for each cluster select the size of the cluster, the average node degree, and the average node strength and sorts by cluster size
group_by(nodes, cluster) %>%
summarise(size = n(),
avg_degree = mean(degree),
avg_strength = mean(strength)
) %>%
arrange(-size)
# add cluster information to the network
V(g)$cluster <- nodes$cluster
# visualize the original network with colored clusters
ggraph(g, layout = "with_kk") +
geom_edge_link(aes(alpha = weight), show.legend=FALSE) +
geom_node_point(aes(color = factor(cluster))) +
labs(color = "cluster")
# facet the network with respect to cluster attribute
ggraph(g, layout = "with_kk") +
geom_edge_link(aes(alpha = weight), show.legend=FALSE) +
geom_node_point(aes(color = factor(cluster))) +
facet_nodes(~cluster, scales="free") +
labs(color = "cluster")
# Interactive visualization with visNetwork: convert the igraph object,
# try several igraph layouts, then add interactive selection options.
# convert igraph to visNetwork
data <- visNetwork::toVisNetworkData(g)
# print head of nodes and ties
head(data$nodes)
head(data$edges)
# visualize the network
visNetwork::visNetwork(nodes = data$nodes, edges = data$edges, width = 300, height = 300)
# use the Kamada-Kawai layout
visNetwork::visNetwork(nodes = data$nodes, edges = data$edges, width = 300, height = 300) %>%
visNetwork::visIgraphLayout(layout = "layout_with_kk")
# use the circle layout
visNetwork::visNetwork(nodes = data$nodes, edges = data$edges, width = 300, height = 300) %>%
visNetwork::visIgraphLayout(layout = "layout_in_circle")
# use the grid layout
visNetwork::visNetwork(nodes = data$nodes, edges = data$edges, width = 300, height = 300) %>%
visNetwork::visIgraphLayout(layout = "layout_on_grid")
# highlight nearest nodes and ties of the selected node
visNetwork::visNetwork(nodes = data$nodes, edges = data$edges, width = 300, height = 300) %>%
visNetwork::visIgraphLayout(layout = "layout_with_kk") %>%
visNetwork::visOptions(highlightNearest = TRUE)
# select nodes by id
visNetwork::visNetwork(nodes = data$nodes, edges = data$edges, width = 300, height = 300) %>%
visNetwork::visIgraphLayout(layout = "layout_with_kk") %>%
visNetwork::visOptions(nodesIdSelection = TRUE)
# set color to cluster and generate network data
V(g)$color = V(g)$cluster
data <- visNetwork::toVisNetworkData(g)
# select by group (cluster)
visNetwork::visNetwork(nodes = data$nodes, edges = data$edges, width = 300, height = 300) %>%
visNetwork::visIgraphLayout(layout = "layout_with_kk") %>%
visNetwork::visOptions(selectedBy = "group")
Chapter 1 - Introduction to Data Privacy
Intro to Anonymization - Part I:
Intro to Anonymization - Part II:
Data Synthesis:
Example code includes:
# Basic anonymization of the White House salary data: replace direct
# identifiers (names) with random ids, then coarsen the quasi-identifier
# Salary by rounding to the nearest ten thousand.
load("./RInputFiles/dataPriv.RData")
# Preview data
whitehouse
## # A tibble: 469 x 5
## Name Status Salary Basis Title
## <chr> <chr> <dbl> <chr> <chr>
## 1 Abrams, Ada~ Employ~ 66300 Per An~ WESTERN REGIONAL COMMUNICATIONS DI~
## 2 Adams, Ian ~ Employ~ 45000 Per An~ EXECUTIVE ASSISTANT TO THE DIRECTO~
## 3 Agnew, Davi~ Employ~ 93840 Per An~ DEPUTY DIRECTOR OF INTERGOVERNMENT~
## 4 Albino, Jam~ Employ~ 91800 Per An~ SENIOR PROGRAM MANAGER
## 5 Aldy, Jr., ~ Employ~ 130500 Per An~ SPECIAL ASSISTANT TO THE PRESIDENT~
## 6 Alley, Hila~ Employ~ 42000 Per An~ STAFF ASSISTANT
## 7 Amorsingh, ~ Employ~ 56092 Per An~ SPECIAL ASSISTANT
## 8 Anderson, A~ Employ~ 60000 Per An~ SPECIAL ASSISTANT TO THE CHIEF OF ~
## 9 Anderson, C~ Employ~ 51000 Per An~ POLICY ASSISTANT
## 10 Andrias, Ka~ Employ~ 130500 Per An~ SPECIAL ASSISTANT TO THE PRESIDENT~
## # ... with 459 more rows
# Set seed
set.seed(42)
# Replace names with random numbers from 1 to 1000
# (replace = FALSE guarantees the pseudonyms are unique)
whitehouse_no_names <- whitehouse %>%
mutate(Name = sample(1:1000, nrow(.), replace=FALSE))
whitehouse_no_names
## # A tibble: 469 x 5
## Name Status Salary Basis Title
## <int> <chr> <dbl> <chr> <chr>
## 1 915 Employee 66300 Per An~ WESTERN REGIONAL COMMUNICATIONS DIRECTOR
## 2 937 Employee 45000 Per An~ EXECUTIVE ASSISTANT TO THE DIRECTOR OF S~
## 3 286 Employee 93840 Per An~ DEPUTY DIRECTOR OF INTERGOVERNMENTAL AFF~
## 4 828 Employee 91800 Per An~ SENIOR PROGRAM MANAGER
## 5 640 Employee 130500 Per An~ SPECIAL ASSISTANT TO THE PRESIDENT FOR E~
## 6 517 Employee 42000 Per An~ STAFF ASSISTANT
## 7 733 Employee 56092 Per An~ SPECIAL ASSISTANT
## 8 134 Employee 60000 Per An~ SPECIAL ASSISTANT TO THE CHIEF OF STAFF
## 9 652 Employee 51000 Per An~ POLICY ASSISTANT
## 10 699 Employee 130500 Per An~ SPECIAL ASSISTANT TO THE PRESIDENT AND A~
## # ... with 459 more rows
# Rounding Salary to the nearest ten thousand
# (round() with negative digits rounds to a power of ten: -4 => 10,000)
whitehouse_no_identifiers <- whitehouse_no_names %>%
mutate(Salary = round(Salary, -4))
whitehouse_no_identifiers
## # A tibble: 469 x 5
## Name Status Salary Basis Title
## <int> <chr> <dbl> <chr> <chr>
## 1 915 Employee 70000 Per An~ WESTERN REGIONAL COMMUNICATIONS DIRECTOR
## 2 937 Employee 40000 Per An~ EXECUTIVE ASSISTANT TO THE DIRECTOR OF S~
## 3 286 Employee 90000 Per An~ DEPUTY DIRECTOR OF INTERGOVERNMENTAL AFF~
## 4 828 Employee 90000 Per An~ SENIOR PROGRAM MANAGER
## 5 640 Employee 130000 Per An~ SPECIAL ASSISTANT TO THE PRESIDENT FOR E~
## 6 517 Employee 40000 Per An~ STAFF ASSISTANT
## 7 733 Employee 60000 Per An~ SPECIAL ASSISTANT
## 8 134 Employee 60000 Per An~ SPECIAL ASSISTANT TO THE CHIEF OF STAFF
## 9 652 Employee 50000 Per An~ POLICY ASSISTANT
## 10 699 Employee 130000 Per An~ SPECIAL ASSISTANT TO THE PRESIDENT AND A~
## # ... with 459 more rows
# Generalization and bottom coding: bucket Salary into three ordinal
# categories, then clamp all salaries below 45,000 up to that floor.
# Convert the salaries into three categories
# (0: < 50k, 1: 50k-100k, 2: >= 100k)
whitehouse.gen <- whitehouse %>%
mutate(Salary = ifelse(Salary < 50000, 0,
ifelse(Salary >= 50000 & Salary < 100000, 1, 2)))
whitehouse.gen
## # A tibble: 469 x 5
## Name Status Salary Basis Title
## <chr> <chr> <dbl> <chr> <chr>
## 1 Abrams, Ada~ Employ~ 1 Per An~ WESTERN REGIONAL COMMUNICATIONS DI~
## 2 Adams, Ian ~ Employ~ 0 Per An~ EXECUTIVE ASSISTANT TO THE DIRECTO~
## 3 Agnew, Davi~ Employ~ 1 Per An~ DEPUTY DIRECTOR OF INTERGOVERNMENT~
## 4 Albino, Jam~ Employ~ 1 Per An~ SENIOR PROGRAM MANAGER
## 5 Aldy, Jr., ~ Employ~ 2 Per An~ SPECIAL ASSISTANT TO THE PRESIDENT~
## 6 Alley, Hila~ Employ~ 0 Per An~ STAFF ASSISTANT
## 7 Amorsingh, ~ Employ~ 1 Per An~ SPECIAL ASSISTANT
## 8 Anderson, A~ Employ~ 1 Per An~ SPECIAL ASSISTANT TO THE CHIEF OF ~
## 9 Anderson, C~ Employ~ 1 Per An~ POLICY ASSISTANT
## 10 Andrias, Ka~ Employ~ 2 Per An~ SPECIAL ASSISTANT TO THE PRESIDENT~
## # ... with 459 more rows
# Bottom Coding
# (pmax raises every salary below 45,000 to exactly 45,000)
whitehouse.bottom <- whitehouse %>%
mutate(Salary = pmax(Salary, 45000))
# Filter Results
whitehouse.bottom %>%
filter(Salary <= 45000)
## # A tibble: 109 x 5
## Name Status Salary Basis Title
## <chr> <chr> <dbl> <chr> <chr>
## 1 Adams, Ian H. Employ~ 45000 Per An~ EXECUTIVE ASSISTANT TO THE DIRECT~
## 2 Alley, Hilar~ Employ~ 45000 Per An~ STAFF ASSISTANT
## 3 Asen, Jonath~ Employ~ 45000 Per An~ SENIOR ANALYST
## 4 Ayling, Lind~ Employ~ 45000 Per An~ ANALYST
## 5 Baggetto, Ma~ Employ~ 45000 Per An~ STAFF ASSISTANT
## 6 Bates, Andre~ Employ~ 45000 Per An~ MEDIA MONITOR
## 7 Belive, Laur~ Employ~ 45000 Per An~ LEGISLATIVE ASSISTANT AND ASSISTA~
## 8 Bisi, Rachel~ Employ~ 45000 Per An~ LEGISLATIVE ASSISTANT
## 9 Block, Micha~ Employ~ 45000 Per An~ STAFF ASSISTANT
## 10 Blount, Patr~ Employ~ 45000 Per An~ RECORDS MANAGEMENT ANALYST
## # ... with 99 more rows
# Explore the fertility dataset, then synthesize binary columns by sampling
# from Bernoulli distributions parameterized by the observed proportions.
# NOTE(review): funs() (used below) is deprecated in dplyr >= 0.8 in favor
# of list(mean = mean, sd = sd); kept as-is to match the recorded output.
# View fertility data
fertility
## # A tibble: 100 x 10
## Season Age Child_Disease Accident_Trauma Surgical_Interv~ High_Fevers
## <dbl> <dbl> <int> <int> <int> <int>
## 1 -0.33 0.69 0 1 1 0
## 2 -0.33 0.94 1 0 1 0
## 3 -0.33 0.5 1 0 0 0
## 4 -0.33 0.75 0 1 1 0
## 5 -0.33 0.67 1 1 0 0
## 6 -0.33 0.67 1 0 1 0
## 7 -0.33 0.67 0 0 0 -1
## 8 -0.33 1 1 1 1 0
## 9 1 0.64 0 0 1 0
## 10 1 0.61 1 0 0 0
## # ... with 90 more rows, and 4 more variables: Alcohol_Freq <dbl>,
## # Smoking <int>, Hours_Sitting <dbl>, Diagnosis <int>
# Number of participants with Surgical_Intervention and Diagnosis
fertility %>%
summarise_at(vars(Surgical_Intervention, Diagnosis), sum)
## # A tibble: 1 x 2
## Surgical_Intervention Diagnosis
## <int> <int>
## 1 51 12
# Mean and Standard Deviation of Age
fertility %>%
summarise_at(vars(Age), funs(mean, sd))
## # A tibble: 1 x 2
## mean sd
## <dbl> <dbl>
## 1 0.669 0.121
# Counts of the Groups in High_Fevers
fertility %>%
count(High_Fevers)
## # A tibble: 3 x 2
## High_Fevers n
## <int> <int>
## 1 -1 9
## 2 0 63
## 3 1 28
# Counts of the Groups in Child_Disease
fertility %>%
count(Child_Disease, Accident_Trauma)
## # A tibble: 4 x 3
## Child_Disease Accident_Trauma n
## <int> <int> <int>
## 1 0 0 10
## 2 0 1 3
## 3 1 0 46
## 4 1 1 41
# Find proportions
# (mean of a 0/1 column is the proportion of 1s)
fertility %>%
summarise_at(vars(Accident_Trauma, Surgical_Intervention), mean)
## # A tibble: 1 x 2
## Accident_Trauma Surgical_Intervention
## <dbl> <dbl>
## 1 0.44 0.51
# Set seed
set.seed(42)
# Generate Synthetic data
accident <- rbinom(100, 1, prob=0.440)
surgical <- rbinom(100, 1, prob=0.510)
# Parametric synthesis of Salary: transform toward normality with a square
# root, fit mean/sd, sample from the fitted normal, then back-transform.
# Square root Transformation of Salary
whitehouse.salary <- whitehouse %>%
mutate(Salary = sqrt(Salary))
# Calculate the mean and standard deviation
stats <- whitehouse.salary %>%
summarize(mean(Salary), sd(Salary))
stats
## # A tibble: 1 x 2
## `mean(Salary)` `sd(Salary)`
## <dbl> <dbl>
## 1 279. 71.8
# Generate Synthetic data
set.seed(42)
salary_transformed <- rnorm(nrow(whitehouse), mean=279, sd=71.8)
# Power transformation
# NOTE(review): `**` works but `^` is the documented R exponentiation operator.
salary_original <- salary_transformed ** 2
# Hard bound
# NOTE(review): squaring a real number is never negative, so this bound can
# never trigger here; it is kept as a defensive pattern from the course.
salary <- ifelse(salary_original < 0, 0, salary_original)
Chapter 2 - Introduction to Differential Privacy
Differential Privacy - quantification of privacy loss via a privacy budget:
Global Sensitivity - usual decision-making factor for differential privacy:
Laplace Mechanism - adds noise based on the Laplace distribution with mean 0 and parameters global sensitivity and privacy budget:
Example code includes:
# Differential privacy via the Laplace mechanism: compute global
# sensitivities (GS) for count/proportion/mean/variance queries, then add
# Laplace noise with scale GS/epsilon to each true statistic.
# Number of observations
n <- nrow(fertility)
# Global sensitivity of counts
gs.count <- 1
# Global sensitivity of proportions
gs.prop <- 1/n
# Lower bound of Hours_Sitting
a <- 0
# Upper bound of Hours_Sitting
b <- 1
# Global sensitivity of mean for Hours_Sitting
gs.mean <- (b - a) / n
# Global sensitivity of proportions Hours_Sitting
gs.var <- (b - a)**2 / n
# How many participants had a Surgical_Intervention?
fertility %>%
summarise_at(vars(Surgical_Intervention), sum)
## # A tibble: 1 x 1
## Surgical_Intervention
## <int>
## 1 51
# Set the seed
set.seed(42)
# Apply the Laplace mechanism
# (rdoublex draws from a double-exponential/Laplace distribution centered
# at the true answer with scale = sensitivity / epsilon)
eps <- 0.1
smoothmest::rdoublex(1, 51, 1/eps)
## [1] 52.98337
# Proportion of Accident_Trauma
stats <- fertility %>%
summarise_at(vars(Accident_Trauma), mean)
stats
## # A tibble: 1 x 1
## Accident_Trauma
## <dbl>
## 1 0.44
# Set the seed
set.seed(42)
# Apply the Laplace mechanism
eps <- 0.1
smoothmest::rdoublex(1, 0.440, (1/n)/eps)
## [1] 0.4598337
# Mean and Variance of Hours Sitting
fertility %>%
summarise_at(vars(Hours_Sitting), funs(mean, var))
## # A tibble: 1 x 2
## mean var
## <dbl> <dbl>
## 1 0.407 0.0347
# Setup
set.seed(42)
eps <- 0.1
# Laplace mechanism to mean
smoothmest::rdoublex(1, 0.41, gs.mean/eps)
## [1] 0.4298337
# Laplace mechanism to variance
smoothmest::rdoublex(1, 0.03, gs.var/eps)
## [1] 0.0583491
Chapter 3 - Differentially Private Properties
Sequential Composition - method to require that someone cannot find the real answer by just sending multiple queries:
Parallel Composition - method to account for queries to different parts of the database (no adjustment to epsilon needed):
Post-processing:
Impossible and inconsistent answers:
Example code includes:
# Sequential composition: two queries against the same data split the
# total privacy budget, so each uses epsilon = 0.1 / 2.
# Set Value of Epsilon
eps <- 0.1 / 2
# Number of observations
n <- nrow(fertility)
# Lower bound of Age
a <- 0
# Upper bound of Age
b <- 1
# GS of counts for Diagnosis
gs.count <- 1
# GS of mean for Age
gs.mean <- (b-a)/n
# Number of Participants with abnormal diagnosis
stats1 <- fertility %>%
summarize_at(vars(Diagnosis), sum)
stats1
## # A tibble: 1 x 1
## Diagnosis
## <int>
## 1 12
# Mean of age
stats2 <- fertility %>%
summarize_at(vars(Age), mean)
stats2
## # A tibble: 1 x 1
## Age
## <dbl>
## 1 0.669
# Set seed
set.seed(42)
# Laplace mechanism to the count of abnormal diagnosis
smoothmest::rdoublex(1, 12, gs.count/eps)
## [1] 15.96674
# Laplace mechanism to the mean of age
smoothmest::rdoublex(1, 0.67, gs.mean/eps)
## [1] 0.7266982
# Parallel composition: queries on disjoint subsets (each Diagnosis level,
# each Season) can each use the full epsilon without splitting the budget.
# Set Value of Epsilon
eps <- 0.1
# Mean of Age per diagnosis level
fertility %>%
group_by(Diagnosis) %>%
summarise_at(vars(Age), mean)
## # A tibble: 2 x 2
## Diagnosis Age
## <int> <dbl>
## 1 0 0.664
## 2 1 0.707
# Set the seed
set.seed(42)
# Laplace mechanism to the mean age of participants with an abnormal diagnosis
smoothmest::rdoublex(1, 0.71, gs.mean/eps)
## [1] 0.7298337
# Laplace mechanism to the mean age of participants with a normal diagnosis
smoothmest::rdoublex(1, 0.66, gs.mean/eps)
## [1] 0.6883491
# Set Value of Epsilon
# (three of the four season counts are queried, so the 0.5 budget is split three ways)
eps <- 0.5/3
# GS of Counts
gs.count <- 1
# Number of participants in each of the four seasons
# NOTE(review): the code below repeats the Diagnosis/Age summary; the
# comment (and the 28/37/4 counts used after it) suggest the intended
# query was fertility %>% count(Season) — confirm against the course.
fertility %>%
group_by(Diagnosis) %>%
summarise_at(vars(Age), mean)
## # A tibble: 2 x 2
## Diagnosis Age
## <int> <dbl>
## 1 0 0.664
## 2 1 0.707
# Set the seed
set.seed(42)
# Laplace mechanism to the number of participants who were evaluated in the winter, spring, and summer
winter <- smoothmest::rdoublex(1, 28, gs.count / eps) %>%
round()
spring <- smoothmest::rdoublex(1, 37, gs.count / eps) %>%
round()
summer <- smoothmest::rdoublex(1, 4, gs.count / eps) %>%
round()
# Post-process based on previous queries
# (the fourth count is derived, not queried, so it costs no extra budget)
fall <- nrow(fertility) - winter - spring - summer
# Post-processing noisy answers: clamp impossible values (above n or below
# 0) and normalize a noisy histogram so the counts sum to n.
# Set Value of Epsilon
eps <- 0.01
# GS of counts
gs.count <- 1
# Number of Participants with Child_Disease
fertility %>%
summarise_at(vars(Child_Disease), sum)
## # A tibble: 1 x 1
## Child_Disease
## <int>
## 1 87
# Apply the Laplace mechanism
set.seed(42)
lap_childhood <- smoothmest::rdoublex(1, 87, gs.count / eps) %>%
round()
# Total number of observations in fertility
max_value <- nrow(fertility)
# Bound the value such that the noisy answer does not exceed the total number of observations
ifelse(lap_childhood > max_value, max_value, lap_childhood)
## [1] 100
# Set the seed
set.seed(42)
# Apply the Laplace mechanism
# (max(0, .) floors each noisy count at zero — a count cannot be negative)
fever1 <- smoothmest::rdoublex(1, 9, gs.count/eps) %>%
max(0)
fever2 <- smoothmest::rdoublex(1, 63, gs.count/eps) %>%
max(0)
fever3 <- smoothmest::rdoublex(1, 28, gs.count/eps) %>%
max(0)
fever <- c(fever1, fever2, fever3)
# Normalize noise
fever_normalized <- (fever/sum(fever)) * (nrow(fertility))
# Round the values
round(fever_normalized)
## [1] 24 76 0
Chapter 4 - Differentially Private Data Synthesis
Laplace Sanitizer - basic way to generate “noisy” categorical data:
Parametric Approaches:
Wrap up:
Example code includes:
# Laplace sanitizer: add Laplace noise to each season's count, normalize to
# n, then emit synthetic categorical data by repeating each level's code
# the noisy-normalized number of times.
# Set Value of Epsilon
eps <- 0.1
# GS of Counts
gs.count <- 1
# Number of participants in each season
fertility %>%
count(Season)
## # A tibble: 4 x 2
## Season n
## <dbl> <int>
## 1 -1 28
## 2 -0.33 37
## 3 0.33 4
## 4 1 31
# Set the seed
set.seed(42)
# Apply the Laplace mechanism
winter <- smoothmest::rdoublex(1, 28, gs.count/eps) %>% max(0)
spring <- smoothmest::rdoublex(1, 37, gs.count/eps) %>% max(0)
summer <- smoothmest::rdoublex(1, 4, gs.count/eps) %>% max(0)
fall <- smoothmest::rdoublex(1, 31, gs.count/eps) %>% max(0)
# Store noisy results
seasons <- c(winter = winter, spring = spring, summer = summer, fall = fall)
# Normalizing seasons
seasons_normalized <- (seasons/sum(seasons)) * nrow(fertility)
# Round the values
round(seasons_normalized)
## winter spring summer fall
## 29 38 0 33
# Generate synthetic data for winter
rep(-1, 29)
## [1] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
## [24] -1 -1 -1 -1 -1 -1
# Generate synthetic data for spring
rep(-0.33, 38)
## [1] -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33
## [12] -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33
## [23] -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33 -0.33
## [34] -0.33 -0.33 -0.33 -0.33 -0.33
# Generate synthetic data for summer
rep(0.33, 0)
## numeric(0)
# Generate synthetic data for fall
rep(1, 33)
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# Differentially private parametric synthesis: noise the sufficient
# statistics (proportions; mean and variance of Age), then sample synthetic
# data from distributions parameterized by the noisy statistics.
# Calculate proportions
fertility %>%
summarise_at(vars(Accident_Trauma, Surgical_Intervention), mean)
## # A tibble: 1 x 2
## Accident_Trauma Surgical_Intervention
## <dbl> <dbl>
## 1 0.44 0.51
# Number of Observations
n <- nrow(fertility)
# Set Value of Epsilon
eps <- 0.1
# GS of Proportion
gs.prop <- (1/n)
# Apply the Laplace mechanism
set.seed(42)
smoothmest::rdoublex(1, 0.44, gs.prop/eps)
## [1] 0.4598337
smoothmest::rdoublex(1, 0.51, gs.prop/eps)
## [1] 0.5383491
# Generate Synthetic data
# (Bernoulli draws parameterized by the rounded noisy proportions)
set.seed(42)
accident <- rbinom(n, 1, 0.46)
surgical <- rbinom(n, 1, 0.54)
# Set Value of Epsilon
# (mean and variance are two queries on the same data: sequential composition)
eps <- 0.1 / 2
# Number of observations
n <- nrow(fertility)
# Upper and lower bounds of age
a <- 0
b <- 1
# GS of mean and variance for age
gs.mean <- (b-a) / n
gs.var <- (b-a)**2 / n
# Mean and Variance of Age
fertility %>%
summarise_at(vars(Age), funs(mean, var))
## # A tibble: 1 x 2
## mean var
## <dbl> <dbl>
## 1 0.669 0.0147
# Apply the Laplace mechanism
set.seed(42)
smoothmest::rdoublex(1, 0.67, gs.mean/eps)
## [1] 0.7096674
smoothmest::rdoublex(1, 0.01, gs.var/eps)
## [1] 0.06669821
# Generate Synthetic data
set.seed(42)
age <- rnorm(n, mean=0.71, sd=sqrt(0.07))
# Hard Bounding the data
# (clamp synthetic ages back to the plausible [0, 1] range)
age[age < 0] <- 0
age[age > 1] <- 1
Chapter 1 - Modeling Customer Lifetime Value with Linear Regression
Introduction - Verena from INWT Statistics (consultancy in marketing analytics):
Simple linear regression - one predictor variable to predict one response variable:
Multiple linear regression:
Model validation, fit, and prediction:
Example code includes:
salesData <- readr::read_csv("./RInputFiles/salesData.csv")
## Parsed with column specification:
## cols(
## id = col_integer(),
## nItems = col_integer(),
## mostFreqStore = col_character(),
## mostFreqCat = col_character(),
## nCats = col_integer(),
## preferredBrand = col_character(),
## nBrands = col_integer(),
## nPurch = col_integer(),
## salesLast3Mon = col_double(),
## salesThisMon = col_double(),
## daysSinceLastPurch = col_integer(),
## meanItemPrice = col_double(),
## meanShoppingCartValue = col_double(),
## customerDuration = col_integer()
## )
# Structure of dataset
str(salesData, give.attr = FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame': 5122 obs. of 14 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ nItems : int 1469 1463 262 293 108 216 174 122 204 308 ...
## $ mostFreqStore : chr "Stockton" "Stockton" "Colorado Springs" "Colorado Springs" ...
## $ mostFreqCat : chr "Alcohol" "Alcohol" "Shoes" "Bakery" ...
## $ nCats : int 72 73 55 50 32 41 36 31 41 52 ...
## $ preferredBrand : chr "Veina" "Veina" "Bo" "Veina" ...
## $ nBrands : int 517 482 126 108 79 98 78 62 99 103 ...
## $ nPurch : int 82 88 56 43 18 35 34 12 26 33 ...
## $ salesLast3Mon : num 2742 2791 1530 1766 1180 ...
## $ salesThisMon : num 1284 1243 683 730 553 ...
## $ daysSinceLastPurch : int 1 1 1 1 12 2 2 4 14 1 ...
## $ meanItemPrice : num 1.87 1.91 5.84 6.03 10.93 ...
## $ meanShoppingCartValue: num 33.4 31.7 27.3 41.1 65.6 ...
## $ customerDuration : int 821 657 548 596 603 673 612 517 709 480 ...
# Visualization of pairwise correlations among the numeric predictors
# (drop id first: it carries no information)
salesData %>% select_if(is.numeric) %>%
select(-id) %>%
cor() %>%
corrplot::corrplot()
# Sales this month, broken down by the customer's most frequent store
ggplot(salesData, aes(x = mostFreqStore, y = salesThisMon)) +
  geom_boxplot()
# Sales this month, broken down by the customer's preferred brand
ggplot(salesData, aes(x = preferredBrand, y = salesThisMon)) +
  geom_boxplot()
# Simple linear regression: predict this month's sales from the
# previous three months' sales
salesSimpleModel <- lm(salesThisMon ~ salesLast3Mon, data = salesData)
# Looking at model summary
summary(salesSimpleModel)
##
## Call:
## lm(formula = salesThisMon ~ salesLast3Mon, data = salesData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -570.18 -68.26 3.21 72.98 605.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 99.690501 6.083886 16.39 <2e-16 ***
## salesLast3Mon 0.382696 0.004429 86.40 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 117.5 on 5120 degrees of freedom
## Multiple R-squared: 0.5932, Adjusted R-squared: 0.5931
## F-statistic: 7465 on 1 and 5120 DF, p-value: < 2.2e-16
# Estimate the full model: all predictors except the customer id
salesModel1 <- lm(salesThisMon ~ . -id, data = salesData)
# Check (generalized) variance inflation factors for multicollinearity
car::vif(salesModel1)
## GVIF Df GVIF^(1/(2*Df))
## nItems 11.772600 1 3.431122
## mostFreqStore 1.260469 9 1.012943
## mostFreqCat 1.527348 9 1.023809
## nCats 8.402073 1 2.898633
## preferredBrand 1.682184 9 1.029316
## nBrands 14.150868 1 3.761764
## nPurch 3.083952 1 1.756119
## salesLast3Mon 8.697663 1 2.949180
## daysSinceLastPurch 1.585057 1 1.258991
## meanItemPrice 1.987665 1 1.409846
## meanShoppingCartValue 2.247579 1 1.499193
## customerDuration 1.004664 1 1.002329
# Re-estimate after dropping the brand variables, which showed the
# highest VIFs (nBrands ~ 14) in the full model
salesModel2 <- lm(salesThisMon ~ . -id -preferredBrand -nBrands, data = salesData)
# Re-check variance inflation factors (all now below ~8.5)
car::vif(salesModel2)
## GVIF Df GVIF^(1/(2*Df))
## nItems 6.987456 1 2.643380
## mostFreqStore 1.178251 9 1.009154
## mostFreqCat 1.269636 9 1.013351
## nCats 5.813494 1 2.411119
## nPurch 3.069046 1 1.751869
## salesLast3Mon 8.412520 1 2.900435
## daysSinceLastPurch 1.579426 1 1.256752
## meanItemPrice 1.925494 1 1.387622
## meanShoppingCartValue 2.238410 1 1.496132
## customerDuration 1.002981 1 1.001489
salesData2_4 <- readr::read_csv("./RInputFiles/salesDataMon2To4.csv")
## Parsed with column specification:
## cols(
## id = col_integer(),
## nItems = col_integer(),
## mostFreqStore = col_character(),
## mostFreqCat = col_character(),
## nCats = col_integer(),
## preferredBrand = col_character(),
## nBrands = col_integer(),
## nPurch = col_integer(),
## salesLast3Mon = col_double(),
## daysSinceLastPurch = col_integer(),
## meanItemPrice = col_double(),
## meanShoppingCartValue = col_double(),
## customerDuration = col_integer()
## )
# Get an overview of the new (prediction) data
summary(salesData2_4)
## id nItems mostFreqStore mostFreqCat
## Min. : 1 Min. : 1.0 Length:5173 Length:5173
## 1st Qu.:1372 1st Qu.: 84.0 Class :character Class :character
## Median :2733 Median : 155.0 Mode :character Mode :character
## Mean :2729 Mean : 185.9
## 3rd Qu.:4085 3rd Qu.: 257.0
## Max. :5455 Max. :1461.0
## nCats preferredBrand nBrands nPurch
## Min. : 1.00 Length:5173 Min. : 1.00 Min. : 1.00
## 1st Qu.:27.00 Class :character 1st Qu.: 45.00 1st Qu.:11.00
## Median :37.00 Mode :character Median : 75.00 Median :17.00
## Mean :36.23 Mean : 81.66 Mean :20.02
## 3rd Qu.:46.00 3rd Qu.:110.00 3rd Qu.:27.00
## Max. :74.00 Max. :484.00 Max. :86.00
## salesLast3Mon daysSinceLastPurch meanItemPrice meanShoppingCartValue
## Min. : 189 Min. : 1.000 Min. : 1.879 Min. : 17.58
## 1st Qu.:1068 1st Qu.: 2.000 1st Qu.: 6.049 1st Qu.: 53.88
## Median :1331 Median : 4.000 Median : 8.556 Median : 75.77
## Mean :1324 Mean : 6.589 Mean : 12.116 Mean : 91.88
## 3rd Qu.:1570 3rd Qu.: 7.000 3rd Qu.: 12.969 3rd Qu.: 109.74
## Max. :2745 Max. :87.000 Max. :313.050 Max. :1147.66
## customerDuration
## Min. : 31.0
## 1st Qu.: 580.0
## Median : 682.0
## Mean : 676.8
## 3rd Qu.: 777.0
## Max. :1386.0
# Predict next month's sales for the new customers with the reduced model
predSales5 <- predict(salesModel2, newdata = salesData2_4)
# Average predicted future sales per customer
mean(predSales5)
## [1] 625.1438
Chapter 2 - Logistic Regression for Churn Prevention
Churn prevention in online marketing:
Modeling and model selection:
In-sample model fit and thresholding:
Out-of-sample validation and cross validation:
Example code includes:
defaultData <- readr::read_delim("./RInputFiles/defaultData.csv", delim=";")
## Parsed with column specification:
## cols(
## .default = col_integer()
## )
## See spec(...) for full column specifications.
# Summary statistics of all 25 columns
summary(defaultData)
## ID limitBal sex education
## Min. : 1 Min. : 10000 Min. :1.000 Min. :0.000
## 1st Qu.: 4501 1st Qu.: 50000 1st Qu.:1.000 1st Qu.:1.000
## Median : 9000 Median : 130000 Median :2.000 Median :2.000
## Mean : 9000 Mean : 162902 Mean :1.588 Mean :1.835
## 3rd Qu.:13500 3rd Qu.: 230000 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :18000 Max. :1000000 Max. :2.000 Max. :6.000
## marriage age pay1 pay2
## Min. :0.00 Min. :21.00 Min. :-2.00000 Min. :-2.0000
## 1st Qu.:1.00 1st Qu.:28.00 1st Qu.:-1.00000 1st Qu.:-1.0000
## Median :2.00 Median :34.00 Median : 0.00000 Median : 0.0000
## Mean :1.56 Mean :35.48 Mean : 0.02783 Mean :-0.1017
## 3rd Qu.:2.00 3rd Qu.:41.00 3rd Qu.: 0.00000 3rd Qu.: 0.0000
## Max. :3.00 Max. :75.00 Max. : 8.00000 Max. : 8.0000
## pay3 pay4 pay5 pay6
## Min. :-2.0000 Min. :-2.0000 Min. :-2.000 Min. :-2.0000
## 1st Qu.:-1.0000 1st Qu.:-1.0000 1st Qu.:-1.000 1st Qu.:-1.0000
## Median : 0.0000 Median : 0.0000 Median : 0.000 Median : 0.0000
## Mean :-0.1294 Mean :-0.1974 Mean :-0.228 Mean :-0.2567
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.000 3rd Qu.: 0.0000
## Max. : 8.0000 Max. : 8.0000 Max. : 8.000 Max. : 8.0000
## billAmt1 billAmt2 billAmt3 billAmt4
## Min. :-165580 Min. :-33350 Min. : -34041 Min. :-170000
## 1st Qu.: 3675 1st Qu.: 3149 1st Qu.: 2655 1st Qu.: 2245
## Median : 22450 Median : 21425 Median : 20035 Median : 18703
## Mean : 50030 Mean : 48131 Mean : 45607 Mean : 41074
## 3rd Qu.: 65001 3rd Qu.: 62157 3rd Qu.: 58457 3rd Qu.: 50540
## Max. : 964511 Max. :983931 Max. :1664089 Max. : 891586
## billAmt5 billAmt6 payAmt1 payAmt2
## Min. :-37594 Min. :-339603 Min. : 0 Min. : 0
## 1st Qu.: 1684 1st Qu.: 1150 1st Qu.: 949 1st Qu.: 696
## Median : 18046 Median : 16780 Median : 2087 Median : 2000
## Mean : 39398 Mean : 38009 Mean : 5532 Mean : 5731
## 3rd Qu.: 49355 3rd Qu.: 48442 3rd Qu.: 5000 3rd Qu.: 5000
## Max. :927171 Max. : 961664 Max. :505000 Max. :1684259
## payAmt3 payAmt4 payAmt5 payAmt6
## Min. : 0 Min. : 0 Min. : 0.0 Min. : 0
## 1st Qu.: 307 1st Qu.: 228 1st Qu.: 209.8 1st Qu.: 2
## Median : 1500 Median : 1486 Median : 1500.0 Median : 1400
## Mean : 4629 Mean : 4757 Mean : 4763.7 Mean : 5135
## 3rd Qu.: 4000 3rd Qu.: 4000 3rd Qu.: 4000.0 3rd Qu.: 4000
## Max. :896040 Max. :497000 Max. :417990.0 Max. :528666
## PaymentDefault
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.2306
## 3rd Qu.:0.0000
## Max. :1.0000
# Look at data structure: 18000 customers, all-integer columns
str(defaultData, give.attr=FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame': 18000 obs. of 25 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ limitBal : int 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ sex : int 2 2 2 2 1 1 1 2 2 1 ...
## $ education : int 2 2 2 2 2 1 1 2 3 3 ...
## $ marriage : int 1 2 2 1 1 2 2 2 1 2 ...
## $ age : int 24 26 34 37 57 37 29 23 28 35 ...
## $ pay1 : int 2 -1 0 0 -1 0 0 0 0 -2 ...
## $ pay2 : int 2 2 0 0 0 0 0 -1 0 -2 ...
## $ pay3 : int -1 0 0 0 -1 0 0 -1 2 -2 ...
## $ pay4 : int -1 0 0 0 0 0 0 0 0 -2 ...
## $ pay5 : int -2 0 0 0 0 0 0 0 0 -1 ...
## $ pay6 : int -2 2 0 0 0 0 0 -1 0 -1 ...
## $ billAmt1 : int 3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
## $ billAmt2 : int 3102 1725 14027 48233 5670 57069 412023 380 14096 0 ...
## $ billAmt3 : int 689 2682 13559 49291 35835 57608 445007 601 12108 0 ...
## $ billAmt4 : int 0 3272 14331 28314 20940 19394 542653 221 12211 0 ...
## $ billAmt5 : int 0 3455 14948 28959 19146 19619 483003 -159 11793 13007 ...
## $ billAmt6 : int 0 3261 15549 29547 19131 20024 473944 567 3719 13912 ...
## $ payAmt1 : int 0 0 1518 2000 2000 2500 55000 380 3329 0 ...
## $ payAmt2 : int 689 1000 1500 2019 36681 1815 40000 601 0 0 ...
## $ payAmt3 : int 0 1000 1000 1200 10000 657 38000 0 432 0 ...
## $ payAmt4 : int 0 1000 1000 1100 9000 1000 20239 581 1000 13007 ...
## $ payAmt5 : int 0 0 1000 1069 689 1000 13750 1687 1000 1122 ...
## $ payAmt6 : int 0 2000 5000 1000 679 800 13770 1542 1000 0 ...
## $ PaymentDefault: int 1 1 0 0 0 0 0 0 0 0 ...
# Analyze the balancedness of the dependent variable (~23% defaults).
# geom_bar() counts discrete values directly; the original
# geom_histogram(stat = "count") emitted an "Ignoring unknown
# parameters: binwidth, bins, pad" warning because histogram
# parameters do not apply to counting.
ggplot(defaultData, aes(x = PaymentDefault)) +
  geom_bar()
# Build the full logistic regression model: default ~ all 23 predictors
logitModelFull <- glm(PaymentDefault ~ limitBal + sex + education + marriage +
age + pay1 + pay2 + pay3 + pay4 + pay5 + pay6 + billAmt1 +
billAmt2 + billAmt3 + billAmt4 + billAmt5 + billAmt6 + payAmt1 +
payAmt2 + payAmt3 + payAmt4 + payAmt5 + payAmt6,
family = "binomial", data = defaultData)
# Take a look at the model
summary(logitModelFull)
##
## Call:
## glm(formula = PaymentDefault ~ limitBal + sex + education + marriage +
## age + pay1 + pay2 + pay3 + pay4 + pay5 + pay6 + billAmt1 +
## billAmt2 + billAmt3 + billAmt4 + billAmt5 + billAmt6 + payAmt1 +
## payAmt2 + payAmt3 + payAmt4 + payAmt5 + payAmt6, family = "binomial",
## data = defaultData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.0893 -0.7116 -0.5615 -0.2794 4.2501
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.711e-01 1.505e-01 -3.795 0.000148 ***
## limitBal -4.825e-07 1.985e-07 -2.431 0.015052 *
## sex -8.251e-02 3.880e-02 -2.127 0.033457 *
## education -1.217e-01 2.745e-02 -4.434 9.23e-06 ***
## marriage -1.711e-01 4.016e-02 -4.259 2.05e-05 ***
## age 4.824e-03 2.257e-03 2.137 0.032570 *
## pay1 5.743e-01 2.221e-02 25.864 < 2e-16 ***
## pay2 5.156e-02 2.552e-02 2.020 0.043336 *
## pay3 7.811e-02 2.863e-02 2.728 0.006375 **
## pay4 -1.191e-02 3.285e-02 -0.363 0.716838
## pay5 1.080e-01 3.381e-02 3.193 0.001406 **
## pay6 -1.956e-02 2.750e-02 -0.711 0.476852
## billAmt1 -7.948e-06 1.582e-06 -5.023 5.09e-07 ***
## billAmt2 4.911e-06 2.006e-06 2.448 0.014350 *
## billAmt3 4.203e-07 1.698e-06 0.247 0.804572
## billAmt4 -1.587e-08 1.872e-06 -0.008 0.993234
## billAmt5 9.703e-07 2.154e-06 0.451 0.652293
## billAmt6 6.758e-07 1.591e-06 0.425 0.670955
## payAmt1 -1.878e-05 3.252e-06 -5.777 7.61e-09 ***
## payAmt2 -6.406e-06 2.364e-06 -2.710 0.006731 **
## payAmt3 -3.325e-06 2.401e-06 -1.385 0.166153
## payAmt4 -3.922e-06 2.342e-06 -1.675 0.093970 .
## payAmt5 -2.383e-06 2.168e-06 -1.099 0.271635
## payAmt6 -1.916e-06 1.618e-06 -1.184 0.236521
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 19438 on 17999 degrees of freedom
## Residual deviance: 17216 on 17976 degrees of freedom
## AIC: 17264
##
## Number of Fisher Scoring iterations: 5
# Exponentiate coefficients to interpret them as odds ratios
coefsexp <- coef(logitModelFull) %>% exp() %>% round(2)
coefsexp
## (Intercept) limitBal sex education marriage age
## 0.56 1.00 0.92 0.89 0.84 1.00
## pay1 pay2 pay3 pay4 pay5 pay6
## 1.78 1.05 1.08 0.99 1.11 0.98
## billAmt1 billAmt2 billAmt3 billAmt4 billAmt5 billAmt6
## 1.00 1.00 1.00 1.00 1.00 1.00
## payAmt1 payAmt2 payAmt3 payAmt4 payAmt5 payAmt6
## 1.00 1.00 1.00 1.00 1.00 1.00
# The old (full) model, refit as the starting point for stepwise selection
logitModelFull <- glm(PaymentDefault ~ limitBal + sex + education + marriage +
age + pay1 + pay2 + pay3 + pay4 + pay5 + pay6 + billAmt1 +
billAmt2 + billAmt3 + billAmt4 + billAmt5 + billAmt6 + payAmt1 +
payAmt2 + payAmt3 + payAmt4 + payAmt5 + payAmt6,
family = binomial, defaultData)
# Build the reduced model by stepwise AIC selection (trace=0 silences steps)
logitModelNew <- MASS::stepAIC(logitModelFull, trace=0)
# Look at the model: 16 predictors remain, AIC drops from 17264 to 17254
summary(logitModelNew)
##
## Call:
## glm(formula = PaymentDefault ~ limitBal + sex + education + marriage +
## age + pay1 + pay2 + pay3 + pay5 + billAmt1 + billAmt2 + billAmt5 +
## payAmt1 + payAmt2 + payAmt3 + payAmt4, family = binomial,
## data = defaultData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.0839 -0.7119 -0.5611 -0.2839 4.1800
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.699e-01 1.504e-01 -3.790 0.000151 ***
## limitBal -5.201e-07 1.954e-07 -2.661 0.007791 **
## sex -8.206e-02 3.878e-02 -2.116 0.034338 *
## education -1.212e-01 2.744e-02 -4.418 9.96e-06 ***
## marriage -1.724e-01 4.014e-02 -4.295 1.75e-05 ***
## age 4.863e-03 2.256e-03 2.156 0.031092 *
## pay1 5.740e-01 2.218e-02 25.882 < 2e-16 ***
## pay2 4.979e-02 2.552e-02 1.951 0.051048 .
## pay3 7.197e-02 2.573e-02 2.798 0.005146 **
## pay5 8.859e-02 2.249e-02 3.938 8.20e-05 ***
## billAmt1 -8.130e-06 1.580e-06 -5.144 2.69e-07 ***
## billAmt2 5.238e-06 1.775e-06 2.951 0.003165 **
## billAmt5 1.790e-06 8.782e-07 2.038 0.041554 *
## payAmt1 -1.931e-05 3.258e-06 -5.928 3.06e-09 ***
## payAmt2 -6.572e-06 2.092e-06 -3.142 0.001681 **
## payAmt3 -3.693e-06 2.187e-06 -1.689 0.091241 .
## payAmt4 -4.611e-06 2.062e-06 -2.237 0.025306 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 19438 on 17999 degrees of freedom
## Residual deviance: 17220 on 17983 degrees of freedom
## AIC: 17254
##
## Number of Fisher Scoring iterations: 5
# Save the formula of the reduced model (needed for the out-of-sample part)
formulaLogit <- as.formula(summary(logitModelNew)$call)
formulaLogit
## PaymentDefault ~ limitBal + sex + education + marriage + age +
## pay1 + pay2 + pay3 + pay5 + billAmt1 + billAmt2 + billAmt5 +
## payAmt1 + payAmt2 + payAmt3 + payAmt4
# In-sample predicted probabilities from the full model
defaultData$predFull <- predict(logitModelFull, type = "response", na.action = na.exclude)
# Construct the in-sample confusion matrix at a 0.5 cutoff
confMatrixModelFull <- SDMTools::confusion.matrix(defaultData$PaymentDefault,
defaultData$predFull,
threshold = 0.5
)
confMatrixModelFull
## obs
## pred 0 1
## 0 13441 3154
## 1 409 996
## attr(,"class")
## [1] "confusion.matrix"
# Accuracy = (true positives + true negatives) / all observations
accuracyFull <- sum(diag(confMatrixModelFull)) / sum(confMatrixModelFull)
accuracyFull
## [1] 0.8020556
# Calculate the accuracy for 'logitModelNew'
# In-sample predicted probabilities from the reduced model
defaultData$predNew <- predict(logitModelNew, type = "response", na.action = na.exclude)
# Construct the in-sample confusion matrix at the same 0.5 cutoff
confMatrixModelNew <- SDMTools::confusion.matrix(defaultData$PaymentDefault,
defaultData$predNew,
threshold = 0.5
)
confMatrixModelNew
## obs
## pred 0 1
## 0 13443 3152
## 1 407 998
## attr(,"class")
## [1] "confusion.matrix"
# Accuracy of the reduced model
accuracyNew <- sum(diag(confMatrixModelNew)) / sum(confMatrixModelNew)
accuracyNew
## [1] 0.8022778
# Compare with the full model's accuracy: effectively identical,
# despite the reduced model using far fewer predictors
accuracyFull
## [1] 0.8020556
accuracyNew
## [1] 0.8022778
# Prepare a data frame of candidate thresholds with an empty payoff column
payoffMatrix <- data.frame(threshold = seq(from = 0.1, to = 0.5, by = 0.1), payoff = NA)
payoffMatrix
## threshold payoff
## 1 0.1 NA
## 2 0.2 NA
## 3 0.3 NA
## 4 0.4 NA
## 5 0.5 NA
# Evaluate the business payoff at each candidate threshold.
# seq_along() instead of 1:length(): safe for zero-length input
# (1:length(x) yields c(1, 0) when x is empty).
for (i in seq_along(payoffMatrix$threshold)) {
  # Confusion matrix at the current threshold
  confMatrix <- SDMTools::confusion.matrix(defaultData$PaymentDefault,
                                           defaultData$predNew,
                                           threshold = payoffMatrix$threshold[i])
  # Payoff: +250 per correctly retained customer (cell [1,1]),
  # -1000 per missed default (cell [1,2])
  payoffMatrix$payoff[i] <- confMatrix[1, 1]*250 + confMatrix[1, 2]*(-1000)
}
payoffMatrix
## threshold payoff
## 1 0.1 306750
## 2 0.2 752750
## 3 0.3 888000
## 4 0.4 641250
## 5 0.5 208750
# Split data into train (~66%) and test sets via a Bernoulli draw
set.seed(534381)
defaultData$isTrain <- rbinom(nrow(defaultData), 1, 0.66)
train <- subset(defaultData, isTrain == 1)
test <- subset(defaultData, isTrain == 0)
logitTrainNew <- glm(formulaLogit, family = binomial, data = train) # Modeling
test$predNew <- predict(logitTrainNew, type = "response", newdata = test) # Predictions
# Out-of-sample confusion matrix and accuracy at the payoff-optimal 0.3 threshold
confMatrixModelNew <- SDMTools::confusion.matrix(test$PaymentDefault, test$predNew, threshold = 0.3)
sum(diag(confMatrixModelNew)) / sum(confMatrixModelNew) # Compare this value to the in-sample accuracy
## [1] 0.7797764
# Accuracy cost function for boot::cv.glm: r = observed response,
# pi = predicted probabilities. NOTE(review): the parameter name `pi`
# shadows the built-in constant inside this function — harmless here,
# but easy to misread.
costAcc <- function(r, pi = 0) {
cm <- SDMTools::confusion.matrix(r, pi, threshold = 0.3)
acc <- sum(diag(cm)) / sum(cm)
return(acc)
}
# 6-fold cross-validated accuracy for logitModelNew
set.seed(534381)
boot::cv.glm(defaultData, logitModelNew, cost = costAcc, K = 6)$delta[1]
## [1] 0.7862778
Chapter 3 - Modeling Time to Reorder with Survival Analysis
Survival Analysis Introduction:
Survival curve analysis by Kaplan-Meier:
Cox PH model with constant covariates:
Checking model assumptions and making predictions:
Example code includes:
survData <- readr::read_delim("./RInputFiles/survivalDataExercise.csv", delim=",")
## Parsed with column specification:
## cols(
## daysSinceFirstPurch = col_integer(),
## shoppingCartValue = col_double(),
## gender = col_character(),
## voucher = col_integer(),
## returned = col_integer(),
## boughtAgain = col_integer()
## )
# Keep only time (daysSinceFirstPurch) and event indicator (boughtAgain)
dataNextOrder <- survData %>%
select(daysSinceFirstPurch, boughtAgain)
# Look at the head of the data
head(dataNextOrder)
## # A tibble: 6 x 2
## daysSinceFirstPurch boughtAgain
## <int> <int>
## 1 37 0
## 2 63 1
## 3 48 0
## 4 17 1
## 5 53 0
## 6 11 1
# Histogram of time-to-event, separated by censored (0) vs observed (1)
ggplot(dataNextOrder) +
geom_histogram(aes(x = daysSinceFirstPurch, fill = factor(boughtAgain))) +
facet_grid( ~ boughtAgain) + # Separate plots for boughtAgain = 1 vs. 0
theme(legend.position = "none") # Don't show legend
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Create survival object (time, status); boughtAgain = 0 is right-censored
survObj <- survival::Surv(dataNextOrder$daysSinceFirstPurch, dataNextOrder$boughtAgain)
# Look at structure ("+" suffixes mark censored observations)
str(survObj)
## 'Surv' num [1:5122, 1:2] 37+ 63 48+ 17 53+ 11 22 16 74+ 44 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:2] "time" "status"
## - attr(*, "type")= chr "right"
# Compute and print the Kaplan-Meier fit (no covariates: ~ 1)
fitKMSimple <- survival::survfit(survObj ~ 1)
print(fitKMSimple)
## Call: survfit(formula = survObj ~ 1)
##
## n events median 0.95LCL 0.95UCL
## 5122 3199 41 40 42
# Plot the estimated survival curve
plot(fitKMSimple, conf.int = FALSE, xlab = "Time since first purchase",
ylab = "Survival function", main = "Survival function"
)
# Re-select the data, now also keeping the voucher covariate
dataNextOrder <- survData %>%
select(daysSinceFirstPurch, boughtAgain, voucher)
# Kaplan-Meier fit stratified by voucher usage
fitKMCov <- survival::survfit(survObj ~ voucher, data = dataNextOrder)
# Plot both curves with distinct line types and add a legend
plot(fitKMCov, lty = 2:3, xlab = "Time since first purchase",
ylab = "Survival function", main = "Survival function"
)
legend(90, .9, c("No", "Yes"), lty = 2:3)
# Use the full dataset (all covariates) for the Cox model
dataNextOrder <- survData
# rms requires predictor distributions to be registered via datadist
dd <- rms::datadist(dataNextOrder)
options(datadist = "dd")
# Fit the Cox proportional hazards model; x/y/surv = TRUE store the
# design matrix, response, and survival estimates for later validation
fitCPH <- rms::cph(survival::Surv(daysSinceFirstPurch, boughtAgain) ~
shoppingCartValue + voucher + returned + gender, data = dataNextOrder,
x = TRUE, y = TRUE, surv = TRUE
)
print(fitCPH)
## Cox Proportional Hazards Model
##
## rms::cph(formula = survival::Surv(daysSinceFirstPurch, boughtAgain) ~
## shoppingCartValue + voucher + returned + gender, data = dataNextOrder,
## x = TRUE, y = TRUE, surv = TRUE)
##
## Model Tests Discrimination
## Indexes
## Obs 5122 LR chi2 155.68 R2 0.030
## Events 3199 d.f. 4 Dxy 0.116
## Center -0.2808 Pr(> chi2) 0.0000 g 0.238
## Score chi2 140.57 gr 1.269
## Pr(> chi2) 0.0000
##
## Coef S.E. Wald Z Pr(>|Z|)
## shoppingCartValue -0.0021 0.0003 -7.56 <0.0001
## voucher -0.2945 0.0480 -6.14 <0.0001
## returned -0.3145 0.0495 -6.36 <0.0001
## gender=male 0.1080 0.0363 2.97 0.0029
##
# Exponentiate coefficients to interpret them as hazard ratios
exp(fitCPH$coefficients)
## shoppingCartValue voucher returned gender=male
## 0.9978601 0.7449362 0.7301667 1.1140891
# Plot result summary (log = TRUE displays effects on the log scale)
plot(summary(fitCPH), log = TRUE)
# Check the proportional hazards assumption (Schoenfeld residual test)
testCPH <- survival::cox.zph(fitCPH)
print(testCPH)
## rho chisq p
## shoppingCartValue -0.0168 0.907 0.3409
## voucher -0.0155 0.770 0.3803
## returned 0.0261 2.182 0.1397
## gender=male 0.0390 4.922 0.0265
## GLOBAL NA 8.528 0.0740
# Plot time-dependent beta for the one borderline covariate (gender)
plot(testCPH, var = "gender=male")
# Cross-validate the model fit (10 folds)
rms::validate(fitCPH, method = "crossvalidation", B = 10, dxy = TRUE, pr = FALSE)
## index.orig training test optimism index.corrected n
## Dxy 0.1159 0.1160 0.1145 0.0014 0.1144 10
## R2 0.0299 0.0300 0.0288 0.0013 0.0287 10
## Slope 1.0000 1.0000 0.9733 0.0267 0.9733 10
## D 0.0032 0.0033 0.0042 -0.0009 0.0041 10
## U 0.0000 0.0000 0.0002 -0.0002 0.0002 10
## Q 0.0032 0.0033 0.0040 -0.0007 0.0040 10
## g 0.2380 0.2382 0.2320 0.0062 0.2318 10
# Create data for a single new customer to predict
newCustomer <- data.frame(daysSinceFirstPurch = 21, shoppingCartValue = 99.9, gender = "female",
voucher = 1, returned = 0, stringsAsFactors = FALSE
)
# Predict this customer's survival curve from the fitted Cox model
pred <- survival::survfit(fitCPH, newdata = newCustomer)
print(pred)
## Call: survfit(formula = fitCPH, newdata = newCustomer)
##
## n events median 0.95LCL 0.95UCL
## 5122 3199 47 44 49
plot(pred)
# Correct the customer's gender
newCustomer2 <- newCustomer
newCustomer2$gender <- "male"
# Redo prediction: the median time-to-reorder drops from 47 to 44 days
pred2 <- survival::survfit(fitCPH, newdata = newCustomer2)
print(pred2)
## Call: survfit(formula = fitCPH, newdata = newCustomer2)
##
## n events median 0.95LCL 0.95UCL
## 5122 3199 44 42 47
Chapter 4 - Reducing Dimensionality with Principal Component Analysis
PCA for CRM Data - address multicollinearity and data volume issues in the raw CRM data:
PCA Computation:
PCA Model Specification:
Principal components in a regression analysis:
Wrap up:
Example code includes:
# Load the news dataset; keep the raw copy, which still holds the
# `shares` outcome column needed later for the regression comparison.
load("./RInputFiles/newsData.RData")
rawData <- newsData
# Restrict newsData to the 21 predictor columns used for the PCA
pcaColumns <- c(
  "n_tokens_title", "n_tokens_content", "n_unique_tokens", "num_hrefs",
  "num_self_hrefs", "num_imgs", "num_videos", "num_keywords", "is_weekend",
  "kw_avg_min", "kw_avg_avg", "kw_avg_max", "average_token_length",
  "global_subjectivity", "global_sentiment_polarity",
  "global_rate_positive_words", "global_rate_negative_words",
  "avg_positive_polarity", "avg_negative_polarity",
  "title_subjectivity", "title_sentiment_polarity"
)
newsData <- newsData[, pcaColumns]
# Overview of data structure: 39,644 articles x 21 numeric predictors
str(newsData, give.attr = FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame': 39644 obs. of 21 variables:
## $ n_tokens_title : num 12 9 9 9 13 10 8 12 11 10 ...
## $ n_tokens_content : num 219 255 211 531 1072 ...
## $ n_unique_tokens : num 0.664 0.605 0.575 0.504 0.416 ...
## $ num_hrefs : num 4 3 3 9 19 2 21 20 2 4 ...
## $ num_self_hrefs : num 2 1 1 0 19 2 20 20 0 1 ...
## $ num_imgs : num 1 1 1 1 20 0 20 20 0 1 ...
## $ num_videos : num 0 0 0 0 0 0 0 0 0 1 ...
## $ num_keywords : num 5 4 6 7 7 9 10 9 7 5 ...
## $ is_weekend : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ average_token_length : num 4.68 4.91 4.39 4.4 4.68 ...
## $ global_subjectivity : num 0.522 0.341 0.702 0.43 0.514 ...
## $ global_sentiment_polarity : num 0.0926 0.1489 0.3233 0.1007 0.281 ...
## $ global_rate_positive_words: num 0.0457 0.0431 0.0569 0.0414 0.0746 ...
## $ global_rate_negative_words: num 0.0137 0.01569 0.00948 0.02072 0.01213 ...
## $ avg_positive_polarity : num 0.379 0.287 0.496 0.386 0.411 ...
## $ avg_negative_polarity : num -0.35 -0.119 -0.467 -0.37 -0.22 ...
## $ title_subjectivity : num 0.5 0 0 0 0.455 ...
## $ title_sentiment_polarity : num -0.188 0 0 0 0.136 ...
# Correlation structure of the predictors
newsData %>% cor() %>% corrplot::corrplot()
# Standardize so each variable contributes equally to the PCA
newsData <- newsData %>% scale() %>% as.data.frame()
# Compute PCA (data already centered/scaled above)
pcaNews <- newsData %>% prcomp()
# Eigenvalues (variance explained by each principal component).
# `^` is R's documented exponentiation operator; `**` is only an
# undocumented parser alias for it.
pcaNews$sdev^2
## [1] 3.31015107 2.00241491 1.82662819 1.67421238 1.30249854 1.20443028
## [7] 1.02889482 1.00052438 0.97929267 0.95905061 0.82676492 0.74951891
## [13] 0.73162009 0.66351863 0.62319656 0.57949073 0.47020594 0.41516936
## [19] 0.29926492 0.27690363 0.07624847
# Screeplot: look for the "elbow" in the eigenvalue curve
screeplot(pcaNews, type = "lines")
# Cumulative explained variance per component
summary(pcaNews)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.8194 1.41507 1.35153 1.29391 1.14127 1.09747
## Proportion of Variance 0.1576 0.09535 0.08698 0.07972 0.06202 0.05735
## Cumulative Proportion 0.1576 0.25298 0.33996 0.41969 0.48171 0.53906
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 1.01434 1.00026 0.98959 0.97931 0.90927 0.86575
## Proportion of Variance 0.04899 0.04764 0.04663 0.04567 0.03937 0.03569
## Cumulative Proportion 0.58806 0.63570 0.68234 0.72800 0.76737 0.80307
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 0.85535 0.8146 0.78943 0.76124 0.68572 0.64434
## Proportion of Variance 0.03484 0.0316 0.02968 0.02759 0.02239 0.01977
## Cumulative Proportion 0.83790 0.8695 0.89918 0.92677 0.94916 0.96893
## PC19 PC20 PC21
## Standard deviation 0.54705 0.52622 0.27613
## Proportion of Variance 0.01425 0.01319 0.00363
## Cumulative Proportion 0.98318 0.99637 1.00000
# Kaiser-Guttmann criterion: count components with eigenvalue > 1
# (sdev > 1 is equivalent, since eigenvalue = sdev^2)
sum(pcaNews$sdev > 1)
## [1] 8
# Print loadings of the first six components
pcaNews$rotation[, 1:6] %>% round(2)
## PC1 PC2 PC3 PC4 PC5 PC6
## n_tokens_title -0.05 -0.10 0.01 -0.10 0.20 -0.28
## n_tokens_content 0.23 -0.17 -0.38 0.12 0.15 -0.02
## n_unique_tokens 0.00 0.00 0.00 0.01 0.01 0.06
## num_hrefs 0.26 -0.16 -0.42 -0.03 0.07 0.11
## num_self_hrefs 0.20 -0.07 -0.39 0.06 0.12 0.08
## num_imgs 0.14 -0.15 -0.43 -0.06 0.04 0.08
## num_videos 0.09 -0.20 0.04 -0.19 0.16 -0.14
## num_keywords 0.07 0.11 -0.25 0.14 -0.42 -0.30
## is_weekend 0.05 -0.01 -0.12 -0.02 -0.10 -0.16
## kw_avg_min 0.03 0.01 -0.05 -0.25 -0.65 0.07
## kw_avg_avg 0.02 -0.15 -0.06 -0.61 -0.31 0.17
## kw_avg_max -0.10 -0.21 0.10 -0.50 0.35 0.26
## average_token_length 0.39 -0.02 0.19 0.19 -0.01 0.14
## global_subjectivity 0.45 -0.01 0.23 -0.04 -0.03 0.03
## global_sentiment_polarity 0.25 0.55 -0.03 -0.19 0.11 0.13
## global_rate_positive_words 0.33 0.25 0.14 -0.08 0.04 -0.09
## global_rate_negative_words 0.15 -0.47 0.23 0.11 -0.10 -0.21
## avg_positive_polarity 0.42 0.09 0.17 -0.06 0.02 0.10
## avg_negative_polarity -0.25 0.37 -0.20 -0.04 0.08 0.06
## title_subjectivity 0.07 -0.03 0.01 -0.27 0.07 -0.61
## title_sentiment_polarity 0.07 0.24 -0.11 -0.24 0.15 -0.42
pcaNews %>% biplot(choices=1:2, cex = 0.5)
# Outcome: log(1 + shares), standardized. Taken from the raw copy
# because newsData no longer carries the shares column.
logShares <- rawData %>%
select(shares) %>%
mutate(logShares=log(1+shares)) %>%
pull(logShares) %>%
scale()
newsData <- newsData %>%
cbind(logShares)
# Baseline: predict log shares with all 21 original variables
mod1 <- lm(logShares ~ ., data = newsData)
# Create a data frame with log shares and the first 6 component scores
dataNewsComponents <- cbind(logShares = newsData[, "logShares"], pcaNews$x[, 1:6]) %>%
as.data.frame()
# Reduced model: predict log shares with the first six components only
mod2 <- lm(logShares ~ ., data = dataNewsComponents)
# Adjusted R squared for both models: the 6-component model retains
# most of the (modest) explanatory power of the 21-variable model
summary(mod1)$adj.r.squared
## [1] 0.07954578
summary(mod2)$adj.r.squared
## [1] 0.05066316
Chapter 1 - Setting Up Interactive Web Maps
Introduction to leaflet - open-source JavaScript library that makes interactive, mobile-friendly maps:
Map tiles - over 100 pre-canned maps that are available as bases:
Setting the default map view:
Plotting DataCamp HQ:
Example code includes:
# Load the leaflet library
library(leaflet)
##
## Attaching package: 'leaflet'
## The following object is masked from 'package:xts':
##
## addLegend
# Create a leaflet map with the default OpenStreetMap tile via addTiles()
leaflet() %>%
addTiles()
# Print the (long) list of third-party tile providers bundled with leaflet
providers
## $OpenStreetMap
## [1] "OpenStreetMap"
##
## $OpenStreetMap.Mapnik
## [1] "OpenStreetMap.Mapnik"
##
## $OpenStreetMap.BlackAndWhite
## [1] "OpenStreetMap.BlackAndWhite"
##
## $OpenStreetMap.DE
## [1] "OpenStreetMap.DE"
##
## $OpenStreetMap.CH
## [1] "OpenStreetMap.CH"
##
## $OpenStreetMap.France
## [1] "OpenStreetMap.France"
##
## $OpenStreetMap.HOT
## [1] "OpenStreetMap.HOT"
##
## $OpenStreetMap.BZH
## [1] "OpenStreetMap.BZH"
##
## $OpenInfraMap
## [1] "OpenInfraMap"
##
## $OpenInfraMap.Power
## [1] "OpenInfraMap.Power"
##
## $OpenInfraMap.Telecom
## [1] "OpenInfraMap.Telecom"
##
## $OpenInfraMap.Petroleum
## [1] "OpenInfraMap.Petroleum"
##
## $OpenInfraMap.Water
## [1] "OpenInfraMap.Water"
##
## $OpenSeaMap
## [1] "OpenSeaMap"
##
## $OpenPtMap
## [1] "OpenPtMap"
##
## $OpenTopoMap
## [1] "OpenTopoMap"
##
## $OpenRailwayMap
## [1] "OpenRailwayMap"
##
## $OpenFireMap
## [1] "OpenFireMap"
##
## $SafeCast
## [1] "SafeCast"
##
## $Thunderforest
## [1] "Thunderforest"
##
## $Thunderforest.OpenCycleMap
## [1] "Thunderforest.OpenCycleMap"
##
## $Thunderforest.Transport
## [1] "Thunderforest.Transport"
##
## $Thunderforest.TransportDark
## [1] "Thunderforest.TransportDark"
##
## $Thunderforest.SpinalMap
## [1] "Thunderforest.SpinalMap"
##
## $Thunderforest.Landscape
## [1] "Thunderforest.Landscape"
##
## $Thunderforest.Outdoors
## [1] "Thunderforest.Outdoors"
##
## $Thunderforest.Pioneer
## [1] "Thunderforest.Pioneer"
##
## $OpenMapSurfer
## [1] "OpenMapSurfer"
##
## $OpenMapSurfer.Roads
## [1] "OpenMapSurfer.Roads"
##
## $OpenMapSurfer.AdminBounds
## [1] "OpenMapSurfer.AdminBounds"
##
## $OpenMapSurfer.Grayscale
## [1] "OpenMapSurfer.Grayscale"
##
## $Hydda
## [1] "Hydda"
##
## $Hydda.Full
## [1] "Hydda.Full"
##
## $Hydda.Base
## [1] "Hydda.Base"
##
## $Hydda.RoadsAndLabels
## [1] "Hydda.RoadsAndLabels"
##
## $MapBox
## [1] "MapBox"
##
## $Stamen
## [1] "Stamen"
##
## $Stamen.Toner
## [1] "Stamen.Toner"
##
## $Stamen.TonerBackground
## [1] "Stamen.TonerBackground"
##
## $Stamen.TonerHybrid
## [1] "Stamen.TonerHybrid"
##
## $Stamen.TonerLines
## [1] "Stamen.TonerLines"
##
## $Stamen.TonerLabels
## [1] "Stamen.TonerLabels"
##
## $Stamen.TonerLite
## [1] "Stamen.TonerLite"
##
## $Stamen.Watercolor
## [1] "Stamen.Watercolor"
##
## $Stamen.Terrain
## [1] "Stamen.Terrain"
##
## $Stamen.TerrainBackground
## [1] "Stamen.TerrainBackground"
##
## $Stamen.TopOSMRelief
## [1] "Stamen.TopOSMRelief"
##
## $Stamen.TopOSMFeatures
## [1] "Stamen.TopOSMFeatures"
##
## $Esri
## [1] "Esri"
##
## $Esri.WorldStreetMap
## [1] "Esri.WorldStreetMap"
##
## $Esri.DeLorme
## [1] "Esri.DeLorme"
##
## $Esri.WorldTopoMap
## [1] "Esri.WorldTopoMap"
##
## $Esri.WorldImagery
## [1] "Esri.WorldImagery"
##
## $Esri.WorldTerrain
## [1] "Esri.WorldTerrain"
##
## $Esri.WorldShadedRelief
## [1] "Esri.WorldShadedRelief"
##
## $Esri.WorldPhysical
## [1] "Esri.WorldPhysical"
##
## $Esri.OceanBasemap
## [1] "Esri.OceanBasemap"
##
## $Esri.NatGeoWorldMap
## [1] "Esri.NatGeoWorldMap"
##
## $Esri.WorldGrayCanvas
## [1] "Esri.WorldGrayCanvas"
##
## $OpenWeatherMap
## [1] "OpenWeatherMap"
##
## $OpenWeatherMap.Clouds
## [1] "OpenWeatherMap.Clouds"
##
## $OpenWeatherMap.CloudsClassic
## [1] "OpenWeatherMap.CloudsClassic"
##
## $OpenWeatherMap.Precipitation
## [1] "OpenWeatherMap.Precipitation"
##
## $OpenWeatherMap.PrecipitationClassic
## [1] "OpenWeatherMap.PrecipitationClassic"
##
## $OpenWeatherMap.Rain
## [1] "OpenWeatherMap.Rain"
##
## $OpenWeatherMap.RainClassic
## [1] "OpenWeatherMap.RainClassic"
##
## $OpenWeatherMap.Pressure
## [1] "OpenWeatherMap.Pressure"
##
## $OpenWeatherMap.PressureContour
## [1] "OpenWeatherMap.PressureContour"
##
## $OpenWeatherMap.Wind
## [1] "OpenWeatherMap.Wind"
##
## $OpenWeatherMap.Temperature
## [1] "OpenWeatherMap.Temperature"
##
## $OpenWeatherMap.Snow
## [1] "OpenWeatherMap.Snow"
##
## $HERE
## [1] "HERE"
##
## $HERE.normalDay
## [1] "HERE.normalDay"
##
## $HERE.normalDayCustom
## [1] "HERE.normalDayCustom"
##
## $HERE.normalDayGrey
## [1] "HERE.normalDayGrey"
##
## $HERE.normalDayMobile
## [1] "HERE.normalDayMobile"
##
## $HERE.normalDayGreyMobile
## [1] "HERE.normalDayGreyMobile"
##
## $HERE.normalDayTransit
## [1] "HERE.normalDayTransit"
##
## $HERE.normalDayTransitMobile
## [1] "HERE.normalDayTransitMobile"
##
## $HERE.normalNight
## [1] "HERE.normalNight"
##
## $HERE.normalNightMobile
## [1] "HERE.normalNightMobile"
##
## $HERE.normalNightGrey
## [1] "HERE.normalNightGrey"
##
## $HERE.normalNightGreyMobile
## [1] "HERE.normalNightGreyMobile"
##
## $HERE.basicMap
## [1] "HERE.basicMap"
##
## $HERE.mapLabels
## [1] "HERE.mapLabels"
##
## $HERE.trafficFlow
## [1] "HERE.trafficFlow"
##
## $HERE.carnavDayGrey
## [1] "HERE.carnavDayGrey"
##
## $HERE.hybridDay
## [1] "HERE.hybridDay"
##
## $HERE.hybridDayMobile
## [1] "HERE.hybridDayMobile"
##
## $HERE.pedestrianDay
## [1] "HERE.pedestrianDay"
##
## $HERE.pedestrianNight
## [1] "HERE.pedestrianNight"
##
## $HERE.satelliteDay
## [1] "HERE.satelliteDay"
##
## $HERE.terrainDay
## [1] "HERE.terrainDay"
##
## $HERE.terrainDayMobile
## [1] "HERE.terrainDayMobile"
##
## $FreeMapSK
## [1] "FreeMapSK"
##
## $MtbMap
## [1] "MtbMap"
##
## $CartoDB
## [1] "CartoDB"
##
## $CartoDB.Positron
## [1] "CartoDB.Positron"
##
## $CartoDB.PositronNoLabels
## [1] "CartoDB.PositronNoLabels"
##
## $CartoDB.PositronOnlyLabels
## [1] "CartoDB.PositronOnlyLabels"
##
## $CartoDB.DarkMatter
## [1] "CartoDB.DarkMatter"
##
## $CartoDB.DarkMatterNoLabels
## [1] "CartoDB.DarkMatterNoLabels"
##
## $CartoDB.DarkMatterOnlyLabels
## [1] "CartoDB.DarkMatterOnlyLabels"
##
## $HikeBike
## [1] "HikeBike"
##
## $HikeBike.HikeBike
## [1] "HikeBike.HikeBike"
##
## $HikeBike.HillShading
## [1] "HikeBike.HillShading"
##
## $BasemapAT
## [1] "BasemapAT"
##
## $BasemapAT.basemap
## [1] "BasemapAT.basemap"
##
## $BasemapAT.grau
## [1] "BasemapAT.grau"
##
## $BasemapAT.overlay
## [1] "BasemapAT.overlay"
##
## $BasemapAT.highdpi
## [1] "BasemapAT.highdpi"
##
## $BasemapAT.orthofoto
## [1] "BasemapAT.orthofoto"
##
## $nlmaps
## [1] "nlmaps"
##
## $nlmaps.standaard
## [1] "nlmaps.standaard"
##
## $nlmaps.pastel
## [1] "nlmaps.pastel"
##
## $nlmaps.grijs
## [1] "nlmaps.grijs"
##
## $nlmaps.luchtfoto
## [1] "nlmaps.luchtfoto"
##
## $NASAGIBS
## [1] "NASAGIBS"
##
## $NASAGIBS.ModisTerraTrueColorCR
## [1] "NASAGIBS.ModisTerraTrueColorCR"
##
## $NASAGIBS.ModisTerraBands367CR
## [1] "NASAGIBS.ModisTerraBands367CR"
##
## $NASAGIBS.ViirsEarthAtNight2012
## [1] "NASAGIBS.ViirsEarthAtNight2012"
##
## $NASAGIBS.ModisTerraLSTDay
## [1] "NASAGIBS.ModisTerraLSTDay"
##
## $NASAGIBS.ModisTerraSnowCover
## [1] "NASAGIBS.ModisTerraSnowCover"
##
## $NASAGIBS.ModisTerraAOD
## [1] "NASAGIBS.ModisTerraAOD"
##
## $NASAGIBS.ModisTerraChlorophyll
## [1] "NASAGIBS.ModisTerraChlorophyll"
##
## $NLS
## [1] "NLS"
##
## $JusticeMap
## [1] "JusticeMap"
##
## $JusticeMap.income
## [1] "JusticeMap.income"
##
## $JusticeMap.americanIndian
## [1] "JusticeMap.americanIndian"
##
## $JusticeMap.asian
## [1] "JusticeMap.asian"
##
## $JusticeMap.black
## [1] "JusticeMap.black"
##
## $JusticeMap.hispanic
## [1] "JusticeMap.hispanic"
##
## $JusticeMap.multi
## [1] "JusticeMap.multi"
##
## $JusticeMap.nonWhite
## [1] "JusticeMap.nonWhite"
##
## $JusticeMap.white
## [1] "JusticeMap.white"
##
## $JusticeMap.plurality
## [1] "JusticeMap.plurality"
##
## $Wikimedia
## [1] "Wikimedia"
# Print only the names of the map tiles in the providers list
names(providers)
## [1] "OpenStreetMap"
## [2] "OpenStreetMap.Mapnik"
## [3] "OpenStreetMap.BlackAndWhite"
## [4] "OpenStreetMap.DE"
## [5] "OpenStreetMap.CH"
## [6] "OpenStreetMap.France"
## [7] "OpenStreetMap.HOT"
## [8] "OpenStreetMap.BZH"
## [9] "OpenInfraMap"
## [10] "OpenInfraMap.Power"
## [11] "OpenInfraMap.Telecom"
## [12] "OpenInfraMap.Petroleum"
## [13] "OpenInfraMap.Water"
## [14] "OpenSeaMap"
## [15] "OpenPtMap"
## [16] "OpenTopoMap"
## [17] "OpenRailwayMap"
## [18] "OpenFireMap"
## [19] "SafeCast"
## [20] "Thunderforest"
## [21] "Thunderforest.OpenCycleMap"
## [22] "Thunderforest.Transport"
## [23] "Thunderforest.TransportDark"
## [24] "Thunderforest.SpinalMap"
## [25] "Thunderforest.Landscape"
## [26] "Thunderforest.Outdoors"
## [27] "Thunderforest.Pioneer"
## [28] "OpenMapSurfer"
## [29] "OpenMapSurfer.Roads"
## [30] "OpenMapSurfer.AdminBounds"
## [31] "OpenMapSurfer.Grayscale"
## [32] "Hydda"
## [33] "Hydda.Full"
## [34] "Hydda.Base"
## [35] "Hydda.RoadsAndLabels"
## [36] "MapBox"
## [37] "Stamen"
## [38] "Stamen.Toner"
## [39] "Stamen.TonerBackground"
## [40] "Stamen.TonerHybrid"
## [41] "Stamen.TonerLines"
## [42] "Stamen.TonerLabels"
## [43] "Stamen.TonerLite"
## [44] "Stamen.Watercolor"
## [45] "Stamen.Terrain"
## [46] "Stamen.TerrainBackground"
## [47] "Stamen.TopOSMRelief"
## [48] "Stamen.TopOSMFeatures"
## [49] "Esri"
## [50] "Esri.WorldStreetMap"
## [51] "Esri.DeLorme"
## [52] "Esri.WorldTopoMap"
## [53] "Esri.WorldImagery"
## [54] "Esri.WorldTerrain"
## [55] "Esri.WorldShadedRelief"
## [56] "Esri.WorldPhysical"
## [57] "Esri.OceanBasemap"
## [58] "Esri.NatGeoWorldMap"
## [59] "Esri.WorldGrayCanvas"
## [60] "OpenWeatherMap"
## [61] "OpenWeatherMap.Clouds"
## [62] "OpenWeatherMap.CloudsClassic"
## [63] "OpenWeatherMap.Precipitation"
## [64] "OpenWeatherMap.PrecipitationClassic"
## [65] "OpenWeatherMap.Rain"
## [66] "OpenWeatherMap.RainClassic"
## [67] "OpenWeatherMap.Pressure"
## [68] "OpenWeatherMap.PressureContour"
## [69] "OpenWeatherMap.Wind"
## [70] "OpenWeatherMap.Temperature"
## [71] "OpenWeatherMap.Snow"
## [72] "HERE"
## [73] "HERE.normalDay"
## [74] "HERE.normalDayCustom"
## [75] "HERE.normalDayGrey"
## [76] "HERE.normalDayMobile"
## [77] "HERE.normalDayGreyMobile"
## [78] "HERE.normalDayTransit"
## [79] "HERE.normalDayTransitMobile"
## [80] "HERE.normalNight"
## [81] "HERE.normalNightMobile"
## [82] "HERE.normalNightGrey"
## [83] "HERE.normalNightGreyMobile"
## [84] "HERE.basicMap"
## [85] "HERE.mapLabels"
## [86] "HERE.trafficFlow"
## [87] "HERE.carnavDayGrey"
## [88] "HERE.hybridDay"
## [89] "HERE.hybridDayMobile"
## [90] "HERE.pedestrianDay"
## [91] "HERE.pedestrianNight"
## [92] "HERE.satelliteDay"
## [93] "HERE.terrainDay"
## [94] "HERE.terrainDayMobile"
## [95] "FreeMapSK"
## [96] "MtbMap"
## [97] "CartoDB"
## [98] "CartoDB.Positron"
## [99] "CartoDB.PositronNoLabels"
## [100] "CartoDB.PositronOnlyLabels"
## [101] "CartoDB.DarkMatter"
## [102] "CartoDB.DarkMatterNoLabels"
## [103] "CartoDB.DarkMatterOnlyLabels"
## [104] "HikeBike"
## [105] "HikeBike.HikeBike"
## [106] "HikeBike.HillShading"
## [107] "BasemapAT"
## [108] "BasemapAT.basemap"
## [109] "BasemapAT.grau"
## [110] "BasemapAT.overlay"
## [111] "BasemapAT.highdpi"
## [112] "BasemapAT.orthofoto"
## [113] "nlmaps"
## [114] "nlmaps.standaard"
## [115] "nlmaps.pastel"
## [116] "nlmaps.grijs"
## [117] "nlmaps.luchtfoto"
## [118] "NASAGIBS"
## [119] "NASAGIBS.ModisTerraTrueColorCR"
## [120] "NASAGIBS.ModisTerraBands367CR"
## [121] "NASAGIBS.ViirsEarthAtNight2012"
## [122] "NASAGIBS.ModisTerraLSTDay"
## [123] "NASAGIBS.ModisTerraSnowCover"
## [124] "NASAGIBS.ModisTerraAOD"
## [125] "NASAGIBS.ModisTerraChlorophyll"
## [126] "NLS"
## [127] "JusticeMap"
## [128] "JusticeMap.income"
## [129] "JusticeMap.americanIndian"
## [130] "JusticeMap.asian"
## [131] "JusticeMap.black"
## [132] "JusticeMap.hispanic"
## [133] "JusticeMap.multi"
## [134] "JusticeMap.nonWhite"
## [135] "JusticeMap.white"
## [136] "JusticeMap.plurality"
## [137] "Wikimedia"
# Use str_detect() to determine if the name of each provider tile contains the string "CartoDB"
str_detect(names(providers), "CartoDB")
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [45] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [56] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [67] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [78] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [89] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE
## [100] TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [111] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [122] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE
# Use str_detect() to print only the provider tile names that include the string "CartoDB"
names(providers)[str_detect(names(providers), "CartoDB")]
## [1] "CartoDB" "CartoDB.Positron"
## [3] "CartoDB.PositronNoLabels" "CartoDB.PositronOnlyLabels"
## [5] "CartoDB.DarkMatter" "CartoDB.DarkMatterNoLabels"
## [7] "CartoDB.DarkMatterOnlyLabels"
# Change addTiles() to addProviderTiles() and set the provider argument to "CartoDB"
leaflet() %>%
addProviderTiles("CartoDB")
# Create a leaflet map that uses the Esri provider tile
leaflet() %>%
addProviderTiles("Esri")
# Create a leaflet map that uses the CartoDB.PositronNoLabels provider tile
leaflet() %>%
addProviderTiles("CartoDB.PositronNoLabels")
# Map with CartoDB tile centered on DataCamp's NYC office with zoom of 6
leaflet() %>%
addProviderTiles("CartoDB") %>%
setView(lng = -73.98575, lat = 40.74856, zoom = 6)
dc_hq <- tibble::tibble(hq=c("NYC", "Belgium"), lon=c(-73.98575, 4.71786), lat=c(40.7486, 50.8814))
dc_hq
## # A tibble: 2 x 3
## hq lon lat
## <chr> <dbl> <dbl>
## 1 NYC -74.0 40.7
## 2 Belgium 4.72 50.9
# Map with CartoDB.PositronNoLabels tile centered on DataCamp's Belgium office with zoom of 4
leaflet() %>%
addProviderTiles("CartoDB.PositronNoLabels") %>%
setView(lng = dc_hq$lon[2], lat = dc_hq$lat[2], zoom = 4)
leaflet(options = leafletOptions(
# Set minZoom and dragging
minZoom = 12, dragging = TRUE)) %>%
addProviderTiles("CartoDB") %>%
# Set default zoom level
setView(lng = dc_hq$lon[2], lat = dc_hq$lat[2], zoom = 14) %>%
# Set max bounds of map
setMaxBounds(lng1 = dc_hq$lon[2] + 0.05,
lat1 = dc_hq$lat[2] + .05,
lng2 = dc_hq$lon[2] - 0.05,
lat2 = dc_hq$lat[2] - .05)
# Plot DataCamp's NYC HQ
leaflet() %>%
addProviderTiles("CartoDB") %>%
addMarkers(lng = dc_hq$lon[1], lat = dc_hq$lat[1])
# Plot DataCamp's NYC HQ with zoom of 12
leaflet() %>%
addProviderTiles("CartoDB") %>%
addMarkers(lng = -73.98575, lat = 40.74856) %>%
setView(lng = -73.98575, lat = 40.74856, zoom = 12)
# Plot both DataCamp's NYC and Belgium locations
leaflet() %>%
addProviderTiles("CartoDB") %>%
addMarkers(lng = dc_hq$lon, lat = dc_hq$lat)
# Store leaflet hq map in an object called map
map <- leaflet() %>%
addProviderTiles("CartoDB") %>%
# add hq column of dc_hq as popups
addMarkers(lng = dc_hq$lon, lat = dc_hq$lat,
popup = dc_hq$hq
)
# Center the view of map on the Belgium HQ with a zoom of 5
map_zoom <- map %>%
setView(lat = 50.881363, lng = 4.717863, zoom = 5)
# Print map_zoom
map_zoom
Chapter 2 - Plotting points
Introduction to IPEDS Data:
Mapping California colleges:
Labels and pop-ups:
Color coding colleges:
Example code includes:
# Remove markers, reset bounds, and store the updated map in the map object
map <- map %>%
clearMarkers() %>%
clearBounds()
# Print the cleared map
map
ipedsRaw <- readr::read_csv("./RInputFiles/ipeds.csv")
## Parsed with column specification:
## cols(
## name = col_character(),
## lng = col_double(),
## lat = col_double(),
## state = col_character(),
## sector_label = col_character()
## )
# Remove colleges with missing sector information
ipeds <-
ipedsRaw %>%
tidyr::drop_na()
# Count the number of four-year colleges in each state
ipeds %>%
group_by(state) %>%
count()
## # A tibble: 56 x 2
## # Groups: state [56]
## state n
## <chr> <int>
## 1 AK 6
## 2 AL 45
## 3 AR 26
## 4 AS 1
## 5 AZ 50
## 6 CA 272
## 7 CO 53
## 8 CT 33
## 9 DC 18
## 10 DE 7
## # ... with 46 more rows
# Create a list of US States in descending order by the number of colleges in each state
ipeds %>%
group_by(state) %>%
count() %>%
arrange(desc(n))
## # A tibble: 56 x 2
## # Groups: state [56]
## state n
## <chr> <int>
## 1 CA 272
## 2 NY 239
## 3 PA 164
## 4 FL 159
## 5 TX 154
## 6 OH 135
## 7 IL 119
## 8 MA 103
## 9 MO 87
## 10 MN 82
## # ... with 46 more rows
# Create a dataframe called `ca` with data on only colleges in California
ca <- ipeds %>%
filter(state == "CA")
map <- leaflet() %>%
addProviderTiles("CartoDB")
# Use `addMarkers` to plot all of the colleges in `ca` on the `map` leaflet map
map %>%
addMarkers(lng = ca$lng, lat = ca$lat)
la_coords <- data.frame(lat = 34.05223, lon = -118.2437)
# Center the map on LA
map %>%
addMarkers(data = ca) %>%
setView(lat = la_coords$lat, lng = la_coords$lon, zoom = 12)
## Assuming "lng" and "lat" are longitude and latitude, respectively
# Set the zoom level to 8 and store in the map_zoom object
map_zoom <-
map %>%
addMarkers(data = ca) %>%
setView(lat = la_coords$lat, lng = la_coords$lon, zoom = 8)
## Assuming "lng" and "lat" are longitude and latitude, respectively
map_zoom
# Clear the markers from the map
map2 <- map %>% clearMarkers()
# Use addCircleMarkers() to plot each college as a circle
map2 %>%
addCircleMarkers(lng = ca$lng, lat = ca$lat)
# Change the radius of each circle to be 2 pixels and the color to red
map2 %>%
addCircleMarkers(lng = ca$lng, lat = ca$lat, radius = 2, color = "red")
# Add circle markers with popups for college names
map %>%
addCircleMarkers(data = ca, radius = 2, popup = ~name)
## Assuming "lng" and "lat" are longitude and latitude, respectively
# Change circle color to #2cb42c and store map in map_color object
map_color <- map %>%
addCircleMarkers(data = ca, radius = 2, color = "#2cb42c", popup = ~name)
## Assuming "lng" and "lat" are longitude and latitude, respectively
# Print map_color
map_color
# Clear the bounds and markers on the map object and store in map2
map2 <- map %>%
clearBounds() %>%
clearMarkers()
# Add circle markers with popups that display both the institution name and sector
map2 %>%
addCircleMarkers(data = ca, radius = 2,
popup = ~paste0(name, "<br/>", sector_label)
)
## Assuming "lng" and "lat" are longitude and latitude, respectively
# Make the institution name in each popup bold
map2 %>%
addCircleMarkers(data = ca, radius = 2,
popup = ~paste0("<b>", name, "</b>", "<br/>", sector_label)
)
## Assuming "lng" and "lat" are longitude and latitude, respectively
# Add circle markers with labels identifying the name of each college
map %>%
addCircleMarkers(data = ca, radius = 2, label = ~name)
## Assuming "lng" and "lat" are longitude and latitude, respectively
# Use paste0 to add sector information to the label inside parentheses
map %>%
addCircleMarkers(data = ca, radius = 2, label = ~paste0(name, " (", sector_label, ")"))
## Assuming "lng" and "lat" are longitude and latitude, respectively
# Make a color palette called pal for the values of `sector_label` using `colorFactor()`
# Colors should be: "red", "blue", and "#9b4a11" for "Public", "Private", and "For-Profit" colleges, respectively
pal <- colorFactor(palette = c("red", "blue", "#9b4a11"),
levels = c("Public", "Private", "For-Profit")
)
# Add circle markers that color colleges using pal() and the values of sector_label
map2 <- map %>%
addCircleMarkers(data = ca, radius = 2,
color = ~pal(sector_label),
label = ~paste0(name, " (", sector_label, ")")
)
## Assuming "lng" and "lat" are longitude and latitude, respectively
# Print map2
map2
# Add a legend that displays the colors used in pal
map2 %>%
addLegend(pal = pal, values = c("Public", "Private", "For-Profit"))
# Customize the legend
map2 %>%
addLegend(pal = pal,
values = c("Public", "Private", "For-Profit"),
# opacity of .5, title of Sector, and position of topright
opacity = 0.5, title = "Sector", position = "topright"
)
Chapter 3 - Groups, Layers, Extras
Leaflet Extras Package:
Overlay Groups - ability to control the segments that are displayed on the map:
Base Groups - can provide multiple options for toggling (only one may be selected at a time):
Pieces of Flair:
Example code includes:
library(leaflet.extras)
library(htmltools)
leaflet() %>%
addTiles() %>%
addSearchOSM() %>%
addReverseSearchOSM()
m2 <- ipeds %>%
leaflet() %>%
# use the CartoDB provider tile
addProviderTiles("CartoDB") %>%
# center on the middle of the US with zoom of 3
setView(lat = 39.8282, lng = -98.5795, zoom=3)
# Map all American colleges
m2 %>%
addCircleMarkers()
## Assuming "lng" and "lat" are longitude and latitude, respectively
# Create data frame called public with only public colleges
public <- filter(ipeds, sector_label == "Public")
# Create a leaflet map of public colleges called m3
m3 <- leaflet() %>%
addProviderTiles("CartoDB") %>%
addCircleMarkers(data = public, radius = 2, label = ~htmlEscape(name),
color = ~pal(sector_label), group = "Public"
)
## Assuming "lng" and "lat" are longitude and latitude, respectively
m3
# Create data frame called private with only private colleges
private <- filter(ipeds, sector_label == "Private")
# Add private colleges to `m3` as a new layer
m3 <- m3 %>%
addCircleMarkers(data = private, radius = 2, label = ~htmlEscape(name),
color = ~pal(sector_label), group = "Private"
) %>%
addLayersControl(overlayGroups = c("Public", "Private"))
## Assuming "lng" and "lat" are longitude and latitude, respectively
m3
# Create data frame called profit with only for-profit colleges
profit <- filter(ipeds, sector_label == "For-Profit")
# Add for-profit colleges to `m3` as a new layer
m3 <- m3 %>%
addCircleMarkers(data = profit, radius = 2, label = ~htmlEscape(name),
color = ~pal(sector_label), group = "For-Profit"
) %>%
addLayersControl(overlayGroups = c("Public", "Private", "For-Profit"))
## Assuming "lng" and "lat" are longitude and latitude, respectively
# Center the map on the middle of the US with a zoom of 4
m4 <- m3 %>%
setView(lat = 39.8282, lng = -98.5795, zoom = 4)
m4
leaflet() %>%
# Add the OSM, CartoDB and Esri tiles
addTiles(group = "OSM") %>%
addProviderTiles("CartoDB", group = "Carto") %>%
addProviderTiles("Esri", group = "Esri") %>%
# Use addLayersControl to allow users to toggle between basemaps
addLayersControl(baseGroups = c("OSM", "Carto", "Esri"))
m4 <- leaflet() %>%
addTiles(group = "OSM") %>%
addProviderTiles("CartoDB", group = "Carto") %>%
addProviderTiles("Esri", group = "Esri") %>%
addCircleMarkers(data = public, radius = 2, label = ~htmlEscape(name),
color = ~pal(sector_label), group = "Public"
) %>%
addCircleMarkers(data = private, radius = 2, label = ~htmlEscape(name),
color = ~pal(sector_label), group = "Private"
) %>%
addCircleMarkers(data = profit, radius = 2, label = ~htmlEscape(name),
color = ~pal(sector_label), group = "For-Profit"
) %>%
addLayersControl(baseGroups = c("OSM", "Carto", "Esri"),
overlayGroups = c("Public", "Private", "For-Profit")
) %>%
setView(lat = 39.8282, lng = -98.5795, zoom = 4)
## Assuming "lng" and "lat" are longitude and latitude, respectively
## Assuming "lng" and "lat" are longitude and latitude, respectively
## Assuming "lng" and "lat" are longitude and latitude, respectively
m4
ipeds %>%
leaflet() %>%
addTiles() %>%
# Sanitize any html in our labels
addCircleMarkers(radius = 2, label = ~htmlEscape(name),
# Color code colleges by sector using the `pal` color palette
color = ~pal(sector_label),
# Cluster all colleges using `clusterOptions`
clusterOptions = markerClusterOptions()
)
## Assuming "lng" and "lat" are longitude and latitude, respectively
Chapter 4 - Plotting Polygons
Spatial Data - ability to plot polygons rather than points:
Mapping Polygons - can pipe SPDF in to a series of leaflet calls:
Putting Everything Together:
Wrap up - additional resources:
Example code includes:
load("./RInputFiles/nc_zips.Rda")
load("./RInputFiles/wealthiest_zips.Rda")
nc_income <- readr::read_csv("./RInputFiles/mean_income_by_zip_nc.csv")
## Parsed with column specification:
## cols(
## zipcode = col_integer(),
## returns = col_integer(),
## income = col_double(),
## mean_income = col_double()
## )
str(nc_income, give.attr = FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame': 723 obs. of 4 variables:
## $ zipcode : int 28207 28211 27608 28480 27517 27614 28173 28036 27408 28226 ...
## $ returns : int 4470 14060 5690 1510 12710 15670 21880 7640 9100 19240 ...
## $ income : num 2.46e+09 3.32e+09 1.13e+09 2.41e+08 1.97e+09 ...
## $ mean_income: num 550849 235961 197725 159617 154682 ...
# Print a summary of the `shp` data
summary(shp)
## Loading required package: sp
## Object of class SpatialPolygonsDataFrame
## Coordinates:
## min max
## x -84.32187 -75.46089
## y 33.84232 36.58812
## Is projected: FALSE
## proj4string :
## [+init=epsg:4326 +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84
## +towgs84=0,0,0]
## Data attributes:
## GEOID10 ALAND10
## 27006 : 1 100240769: 1
## 27007 : 1 100252722: 1
## 27009 : 1 1003885 : 1
## 27011 : 1 100620829: 1
## 27012 : 1 100707703: 1
## 27013 : 1 101001856: 1
## (Other):802 (Other) :802
# Print the class of `shp`
class(shp)
## [1] "SpatialPolygonsDataFrame"
## attr(,"package")
## [1] "sp"
# Print the slot names of `shp`
slotNames(shp)
## [1] "data" "polygons" "plotOrder" "bbox" "proj4string"
# Glimpse the data slot of shp
glimpse(shp@data)
## Observations: 808
## Variables: 2
## $ GEOID10 <fct> 27925, 28754, 28092, 27217, 28711, 28666, 28602, 27841...
## $ ALAND10 <fct> 624688620, 223734670, 317180853, 318965510, 258603117,...
# Print the class of the data slot of shp
class(shp@data)
## [1] "data.frame"
# Print GEOID10
shp@data$GEOID10
## [1] 27925 28754 28092 27217 28711 28666 28602 27841 27831 28785 27504
## [12] 27330 28768 28658 28716 28139 27565 28394 27982 28025 28159 28382
## [23] 28312 28342 27839 27852 28723 28077 28039 28452 27306 28375 28713
## [34] 28743 28717 28150 28447 27205 27379 28425 27827 27540 28114 28451
## [45] 27892 27249 28628 27873 28781 27916 28705 28714 28101 28102 28445
## [56] 28448 28458 28719 28478 28479 28501 28748 28752 28207 28753 28757
## [67] 28209 28212 28560 28504 27983 27985 28018 28019 28562 28906 28530
## [78] 28771 28779 28782 28376 28581 28152 28169 28170 28657 28021 28204
## [89] 28533 28540 28543 28551 28262 28280 28575 28790 28792 28667 28672
## [100] 28108 28462 28681 28465 28734 28739 28694 28697 28702 28745 28127
## [111] 28420 28422 28424 28428 28435 28088 28089 28090 27562 28334 28787
## [122] 28433 27360 27534 28043 27370 28444 27531 28675 28712 28449 27053
## [133] 27944 28367 28326 28740 28659 28282 27244 27597 27017 28761 28457
## [144] 28441 27956 27889 28652 28146 28513 28777 28786 27596 27530 28369
## [155] 28327 27340 27028 27823 27879 28244 27810 27886 28306 27025 27239
## [166] 27967 27824 27826 27834 27030 28358 28365 27520 27524 27525 27526
## [177] 27292 27874 27882 27883 27885 27253 27576 27577 27582 27295 27298
## [188] 27332 27910 27052 27055 27344 27516 27850 27856 27265 27603 27605
## [199] 27537 27539 27541 28601 28604 27809 27278 27284 27371 27201 27312
## [210] 28320 28325 27207 28330 28607 28611 28612 27549 27555 27317 27320
## [221] 27703 27709 28350 28643 28337 28621 27569 28645 28651 27948 28630
## [232] 27923 27929 27936 27943 28721 28512 27546 27891 28379 27822 27909
## [243] 28655 28662 27587 27589 28625 28742 28553 27941 28134 27043 27893
## [254] 28328 28135 28007 28338 27110 28472 28756 28110 28519 27861 27407
## [265] 28374 28211 28668 27214 27965 27949 27806 28340 27917 27288 27563
## [276] 28669 27229 27283 27109 27843 27047 28303 28585 28676 28689 28305
## [287] 28635 28640 27016 27863 27968 28528 27915 27981 28411 28577 27326
## [298] 27954 28556 27105 27545 27813 27974 27301 28168 28670 28801 27050
## [309] 28610 28665 28125 28538 27849 28036 28586 27801 27807 28904 27875
## [320] 28557 27958 28468 27536 28213 28341 28747 28707 27262 28006 28360
## [331] 28031 27845 28166 28616 27572 27014 27503 27011 28572 28386 27291
## [342] 28432 27804 27343 28073 28467 28173 28539 28352 27828 28515 28555
## [353] 27855 27583 28310 28396 28348 28138 28642 27542 27408 28215 27821
## [364] 28105 28270 28206 28301 27876 28627 27019 28574 28647 28806 27349
## [375] 28091 28660 28726 28508 27840 28803 28511 27964 27978 28086 27927
## [386] 28774 28383 27559 28523 28332 28749 27962 27455 28056 27501 28027
## [397] 27527 27282 27837 28682 27310 28356 27233 27231 27006 28144 27857
## [408] 27042 28314 27612 28525 27281 28147 28366 28629 27523 27937 28119
## [419] 28012 27048 27880 27350 27027 27606 27938 28638 28720 28580 27103
## [430] 27986 28001 28034 28393 28032 28040 28677 28395 28391 28678 28399
## [441] 28455 28098 28401 28103 28684 28685 28409 28071 28683 28083 28708
## [452] 28097 28450 28431 28453 28454 28709 28439 28377 28715 28443 28436
## [463] 28438 28751 28129 28133 28763 28109 28120 28466 28746 28137 28480
## [474] 28759 28731 28762 28405 28054 28698 28081 28403 28052 28701 28690
## [485] 28412 28704 28078 28421 28693 28544 28516 28773 28775 28905 28174
## [496] 28203 28570 28208 28210 28202 28804 28805 28791 28901 28547 28107
## [507] 28722 28729 28461 28730 28463 28552 28554 28115 28732 28112 28214
## [518] 28733 28308 28304 28571 28584 28582 28583 28273 28587 28278 28578
## [529] 28579 28323 28164 28605 28518 28520 28526 28783 28529 28167 28521
## [540] 28531 28311 28163 28537 28772 28626 27942 27928 28634 28649 28339
## [551] 28357 27935 28623 28618 28654 28624 28619 27922 28307 28226 27946
## [562] 27947 28347 28349 28227 28637 27926 27920 28646 28573 27921 28351
## [573] 28269 28590 27341 28364 27604 27976 28615 27357 28344 28613 28609
## [584] 28343 27409 27376 27377 27701 27610 27979 27405 27704 27705 27959
## [595] 27960 27403 27966 27953 27970 27972 27973 27707 27957 27401 27517
## [606] 27502 27507 27508 27509 27510 27518 27505 27020 27613 27024 27514
## [617] 27519 27713 27614 27803 27616 27617 27513 27511 27023 27046 27844
## [628] 27869 27853 27051 27041 27521 27871 27872 27842 27106 27830 27846
## [639] 27013 27862 27104 27832 27847 27858 27865 27851 27825 27829 27012
## [650] 27816 27817 27557 27808 27209 27208 27820 27888 27814 27551 27556
## [661] 27045 27235 27560 27215 27054 27248 27242 27260 27243 27258 27581
## [672] 27812 27601 27592 27591 27544 27316 27313 27325 27314 27311 27896
## [683] 27007 28650 28606 27009 28735 28673 28725 28033 27870 27864 28429
## [694] 28384 28663 27022 28333 27574 28524 28527 28277 27263 28023 27573
## [705] 27615 28020 28464 28128 28009 28205 28104 27299 27884 28076 28080
## [716] 28160 28532 27302 28124 27932 27924 28037 27819 27608 28789 28079
## [727] 28398 27553 27878 27018 27040 28392 27315 28594 27950 28442 27410
## [738] 27805 28371 27305 28778 28692 28072 28456 28589 28363 27355 27358
## [749] 28385 28736 27890 27522 28617 28671 28387 28390 27212 27609 27568
## [760] 28679 27881 27101 28622 28644 28631 28636 28373 28345 27712 28117
## [771] 27866 27021 27406 28741 28372 27897 28430 27980 28017 27203 28909
## [782] 27127 27607 27939 28217 28216 27252 28423 28718 27919 28510 28460
## [793] 28434 28470 28766 28546 27818 27529 28469 28016 28075 28318 27107
## [804] 27356 28315 27571 27860 28902
## 33144 Levels: 00601 00602 00603 00606 00610 00612 00616 00617 00622 ... 99929
shp@data$GEOID10 <- as.integer(as.character(shp@data$GEOID10))
str(shp@data$GEOID10)
## int [1:808] 27925 28754 28092 27217 28711 28666 28602 27841 27831 28785 ...
# Glimpse the nc_income data
glimpse(nc_income)
## Observations: 723
## Variables: 4
## $ zipcode <int> 28207, 28211, 27608, 28480, 27517, 27614, 28173, 2...
## $ returns <int> 4470, 14060, 5690, 1510, 12710, 15670, 21880, 7640...
## $ income <dbl> 2462295000, 3317607000, 1125055000, 241022000, 196...
## $ mean_income <dbl> 550849.0, 235960.7, 197725.0, 159617.2, 154682.2, ...
# Summarise the nc_income data
summary(nc_income)
## zipcode returns income mean_income
## Min. :27006 Min. : 110 Min. :4.557e+06 Min. : 26625
## 1st Qu.:27605 1st Qu.: 1105 1st Qu.:4.615e+07 1st Qu.: 40368
## Median :28115 Median : 3050 Median :1.526e+08 Median : 46288
## Mean :28062 Mean : 5979 Mean :3.648e+08 Mean : 53338
## 3rd Qu.:28521 3rd Qu.: 9050 3rd Qu.:4.670e+08 3rd Qu.: 55917
## Max. :28909 Max. :37020 Max. :3.970e+09 Max. :550849
# Left join nc_income onto shp@data and store in shp_nc_income
# (85 zip-code polygons have no matching row in the income data,
# as the summarise_all() output below shows).
shp_nc_income <- shp@data %>%
left_join(nc_income, by = c("GEOID10" = "zipcode"))
# Print the number of missing values of each variable in shp_nc_income
# NOTE(review): funs() is soft-deprecated as of dplyr 0.8; prefer
# summarise_all(~ sum(is.na(.))) when this code is next updated.
shp_nc_income %>%
summarise_all(funs(sum(is.na(.))))
## GEOID10 ALAND10 returns income mean_income
## 1 0 0 85 85 85
# Attach the joined attributes back onto the SpatialPolygonsDataFrame;
# merge() (the sp method) matches rows by zip code and land area.
shp <- merge(shp, shp_nc_income, by=c("GEOID10", "ALAND10"))
# map the polygons in shp
shp %>%
leaflet() %>%
addTiles() %>%
addPolygons()
# which zips were not in the income data?
shp_na <- shp[is.na(shp$mean_income),]
# map the polygons in shp_na
shp_na %>%
leaflet() %>%
addTiles() %>%
addPolygons()
# summarise the mean income variable
summary(shp$mean_income)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 26625 40368 46288 53338 55917 550849 85
# subset shp to include only zip codes in the top quartile of mean income
# NOTE(review): 55917 is the 3rd-quartile value hard-coded from the summary
# output above; quantile(shp$mean_income, 0.75, na.rm = TRUE) would avoid
# the magic number if the underlying data ever changes.
high_inc <- shp[!is.na(shp$mean_income) & shp$mean_income > 55917,]
# map the boundaries of the zip codes in the top quartile of mean income
high_inc %>%
leaflet() %>%
addTiles() %>%
addPolygons()
dollar <- function(x, negative_parens = TRUE, prefix = "$", suffix = "") {
# Format a numeric vector as whole-dollar currency strings,
# e.g. 1234.56 -> "$1,235" and -10 -> "($10)".
#
# Args:
#   x               numeric vector of amounts (NA values format as "$NA").
#   negative_parens if TRUE, wrap negative amounts in parentheses;
#                   if FALSE, prefix them with a minus sign ("$-10").
#   prefix, suffix  strings pasted before/after each formatted amount.
#
# Returns: character vector the same length as x.
#
# This replaces the original KLUGE version, which defined a local
# needs_cents() that always returned FALSE (so the "cents" branch was dead
# code) and referenced an undefined variable `largest_with_cents` that was
# never evaluated only thanks to lazy evaluation. It also drops the plyr
# dependency: plyr::round_any(x, a) is round(x / a) * a, reproduced inline.
if (length(x) == 0) {
return(character())
}
# Two-step rounding mirrors the original round_any(x, 0.01) then
# round_any(x, 1); amounts are always rendered without cents.
x <- round(x / 0.01) * 0.01
x <- round(x)
nsmall <- 0L
negative <- !is.na(x) & x < 0
amount <- format(abs(x), nsmall = nsmall, trim = TRUE, big.mark = ",",
scientific = FALSE, digits = 1L)
if (negative_parens) {
# Accounting style: negative amounts wrapped in parentheses.
paste0(ifelse(negative, "(", ""), prefix, amount, suffix,
ifelse(negative, ")", ""))
} else {
paste0(prefix, ifelse(negative, "-", ""), amount, suffix)
}
}
# create color palette with colorNumeric() mapping mean income onto the
# sequential Yellow-Green ("YlGn") ColorBrewer palette
nc_pal <- colorNumeric("YlGn", domain = high_inc@data$mean_income)
high_inc %>%
leaflet() %>%
addTiles() %>%
# set boundary thickness to 1 and color each polygon by its mean income
# via nc_pal (the original course comment said "blue", which did not
# match the code)
addPolygons(weight = 1, color = ~nc_pal(mean_income),
# add labels that display mean income, formatted by the dollar() helper
label = ~paste0("Mean Income: ", dollar(mean_income)),
# highlight polygons on hover
# NOTE(review): `highlight` relies on R's partial matching of the
# `highlightOptions` argument; spelling it out in full would be safer.
highlight = highlightOptions(weight = 5, color = "white",
bringToFront = TRUE))
# Create a logged version of the nc_pal color palette. mean_income is
# heavily right-skewed (max 550,849 vs 3rd quartile 55,917 in the summary
# above), so a log scale spreads the colors more evenly across zip codes.
nc_pal <- colorNumeric("YlGn", domain = log(high_inc@data$mean_income))
# apply the nc_pal -- note the same log() transform must be applied inside
# the palette call so values fall within the palette's domain
high_inc %>%
leaflet() %>%
addProviderTiles("CartoDB") %>%
addPolygons(weight = 1, color = ~nc_pal(log(mean_income)), fillOpacity = 1,
label = ~paste0("Mean Income: ", dollar(mean_income)),
highlightOptions = highlightOptions(weight = 5, color = "white", bringToFront = TRUE))
# Print the slot names of `wealthy_zips`
slotNames(wealthy_zips)
## [1] "data" "polygons" "plotOrder" "bbox" "proj4string"
# Print a summary of the `mean_income` variable
summary(wealthy_zips$mean_income)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 200444 229914 279330 339859 371904 2553591
# plot zip codes with mean incomes >= $200k
wealthy_zips %>%
leaflet() %>%
addProviderTiles("CartoDB") %>%
addPolygons(weight = 1, fillOpacity = .7, color = "Green", group = "Wealthy Zipcodes",
label = ~paste0("Mean Income: ", dollar(mean_income)),
highlightOptions = highlightOptions(weight = 5, color = "white", bringToFront = TRUE))
# Add polygons using wealthy_zips
final_map <- m4 %>%
addPolygons(data = wealthy_zips, weight = 1, fillOpacity = .5, color = "Grey", group = "Wealthy Zip Codes",
label = ~paste0("Mean Income: ", dollar(mean_income)),
highlight = highlightOptions(weight = 5, color = "white", bringToFront = TRUE)) %>%
# Update layer controls including "Wealthy Zip Codes"
addLayersControl(baseGroups = c("OSM", "Carto", "Esri"),
overlayGroups = c("Public", "Private", "For-Profit", "Wealthy Zip Codes"))
# Print and explore your very last map of the course!
final_map
Chapter 1 - Introduction
Sugar content of soft drinks:
Generating a linearly separable dataset
Example code includes:
# Build a toy one-dimensional dataset: sugar content of 25 soft-drink
# samples, used to illustrate a maximal-margin separator in one dimension.
df <- data.frame(sample=1:25,
sugar_content=c(10.9, 10.9, 10.6, 10, 8, 8.2, 8.6, 10.9, 10.7, 8, 7.7, 7.8, 8.4, 11.5, 11.2, 8.9, 8.7, 7.4, 10.9, 10, 11.4, 10.8, 8.5, 8.2, 10.6)
)
str(df)
## 'data.frame': 25 obs. of 2 variables:
## $ sample : int 1 2 3 4 5 6 7 8 9 10 ...
## $ sugar_content: num 10.9 10.9 10.6 10 8 8.2 8.6 10.9 10.7 8 ...
#print variable names
names(df)
## [1] "sample" "sugar_content"
#build plot: all points placed on the y = 0 line and labelled with their
#sugar content so the two clusters are visible along a single axis
plot_ <- ggplot(data = df, aes(x = sugar_content, y = c(0))) +
geom_point() +
geom_text(label = df$sugar_content, size = 2.5, vjust = 2, hjust = 0.5)
#display plot
plot_
#The maximal margin separator is at the midpoint of the two extreme points in each cluster.
mm_separator <- (8.9 + 10)/2
#create data frame
separator <- data.frame(sep = c(mm_separator))
#add ggplot layer
plot_ <- plot_ +
geom_point(data = separator, x = separator$sep, y = c(0), color = "blue", size = 4)
#display plot
plot_
#set seed so the generated data and the random splits below are reproducible
set.seed(42)
#set number of data points.
n <- 600
#Generate data frame with two uniformly distributed predictors lying between 0 and 1.
df <- data.frame(x1 = runif(n), x2 = runif(n))
#classify data points depending on location relative to the line x2 = 1.4*x1:
#below the line -> -1, above -> 1
df$y <- factor(ifelse(df$x2 - 1.4*df$x1 < 0, -1, 1), levels = c(-1, 1))
#set margin
delta <- 0.07
# retain only those points that lie outside the margin, leaving a vertical gap
# of width 2*delta around the true boundary so the classes are cleanly separable
df1 <- df[abs(1.4*df$x1 - df$x2) > delta, ]
#build plot: points colored by class, solid line = true boundary,
#dashed lines = margin edges
plot_margins <- ggplot(data = df1, aes(x = x1, y = x2, color = y)) + geom_point() +
scale_color_manual(values = c("red", "blue")) +
geom_abline(slope = 1.4, intercept = 0)+
geom_abline(slope = 1.4, intercept = delta, linetype = "dashed") +
geom_abline(slope = 1.4, intercept = -delta, linetype = "dashed")
#display plot
plot_margins
Chapter 2 - Support Vector Classifiers - Linear Kernels
Linear Support Vector Machines:
Visualizing Linear SVM:
Tuning Linear SVM:
Multi-class problems:
Example code includes:
# Keep a copy of the full simulated data, then drop the in-margin points
dfOld <- df
delta <- 0.07
df <- df[abs(1.4*df$x1 - df$x2) > delta, ]
#split train and test data in an 80/20 proportion (random; depends on the RNG
#state carried over from set.seed(42) above)
df[, "train"] <- ifelse(runif(nrow(df))<0.8, 1, 0)
#assign training rows to data frame trainset
trainset <- df[df$train == 1, ]
#assign test rows to data frame testset
testset <- df[df$train == 0, ]
#find index of "train" column
trainColNum <- grep("train", names(df))
#remove "train" column from train and test dataset
trainset <- trainset[, -trainColNum]
testset <- testset[, -trainColNum]
library(e1071)
#build svm model, setting required parameters; scale = FALSE fits on the raw
#coordinates, so the fitted boundary can be compared with the true line later
svm_model<- svm(y ~ .,
data = trainset,
type = "C-classification",
kernel = "linear",
scale = FALSE)
#list components of model
names(svm_model)
## [1] "call" "type" "kernel"
## [4] "cost" "degree" "gamma"
## [7] "coef0" "nu" "epsilon"
## [10] "sparse" "scaled" "x.scale"
## [13] "y.scale" "nclasses" "levels"
## [16] "tot.nSV" "nSV" "labels"
## [19] "SV" "index" "rho"
## [22] "compprob" "probA" "probB"
## [25] "sigma" "coefs" "na.action"
## [28] "fitted" "decision.values" "terms"
#list values of the SV, index and rho: the support vectors themselves, their
#row positions in the training data, and the boundary offset term
svm_model$SV
## x1 x2
## 11 0.4577417762 0.476919189
## 19 0.4749970816 0.486642912
## 45 0.4317512489 0.520339758
## 58 0.1712643304 0.100229354
## 61 0.6756072745 0.772399305
## 69 0.6932048204 0.838569788
## 99 0.7439746463 0.912029979
## 101 0.6262453445 0.765520479
## 103 0.2165673110 0.202548483
## 118 0.3556659538 0.298152283
## 143 0.4640695513 0.535269056
## 144 0.7793681615 0.941694443
## 147 0.1701624813 0.050030747
## 173 0.4140496817 0.380267640
## 176 0.1364903601 0.011009041
## 180 0.7690324257 0.951921815
## 194 0.1290892835 0.021196302
## 199 0.7431877197 0.824081728
## 204 0.4427962683 0.532290264
## 209 0.2524584394 0.281511990
## 226 0.8205145481 0.962842692
## 253 0.2697161783 0.288755647
## 268 0.2050496121 0.182046106
## 272 0.7853494422 0.870432480
## 278 0.4037828147 0.476424339
## 286 0.1709963905 0.164468810
## 294 0.3864540118 0.370921416
## 295 0.3324459905 0.382318948
## 325 0.5648222226 0.618285144
## 338 0.3169501573 0.333509587
## 341 0.4091320913 0.496387038
## 344 0.3597852497 0.345139100
## 393 0.6568108753 0.815567016
## 400 0.0755990995 0.007417523
## 406 0.1079870730 0.022227321
## 413 0.2401496081 0.151690785
## 427 0.4664852461 0.464965629
## 443 0.3626018071 0.369346223
## 450 0.0619409799 0.011438249
## 466 0.6399842701 0.695480783
## 479 0.1730011790 0.136427131
## 503 0.5195604505 0.627322678
## 525 0.6494539515 0.833293378
## 526 0.6903516576 0.790328991
## 535 0.4243346907 0.470753220
## 590 0.7148487861 0.902375512
## 595 0.8058112133 0.937903824
## 600 0.4587231132 0.446819442
## 15 0.4622928225 0.839631285
## 29 0.4469696281 0.721333573
## 37 0.0073341469 0.108096598
## 38 0.2076589728 0.519075874
## 59 0.2610879638 0.472588875
## 90 0.3052183695 0.548420829
## 92 0.0002388966 0.122946701
## 102 0.2171576982 0.505044580
## 104 0.3889450287 0.717138722
## 129 0.2335235255 0.439058027
## 132 0.6034740848 0.958318281
## 133 0.6315072989 0.970767964
## 158 0.0290858189 0.148069276
## 175 0.4274944656 0.725024226
## 178 0.5923042425 0.900228734
## 189 0.1333296183 0.390023998
## 196 0.0531294835 0.276241161
## 202 0.5171110556 0.899924811
## 210 0.2596899802 0.503687580
## 215 0.4513108502 0.743930877
## 229 0.0483467767 0.218475638
## 232 0.1590223818 0.402696270
## 237 0.0865806018 0.263718613
## 239 0.5545858634 0.935806216
## 249 0.4992728804 0.812805236
## 258 0.5397982858 0.932383237
## 276 0.3367135401 0.672058288
## 293 0.3152607968 0.625878707
## 309 0.3199476011 0.541676977
## 311 0.1078112544 0.374908455
## 378 0.1084886545 0.376079086
## 409 0.0842775232 0.235715229
## 419 0.4264662997 0.798480970
## 420 0.0600483362 0.298929408
## 430 0.5141573721 0.908452330
## 451 0.4309255683 0.821331850
## 477 0.5964720468 0.913432184
## 481 0.2329343846 0.409654615
## 482 0.5770482090 0.969947845
## 488 0.2485451805 0.533491509
## 520 0.5784583788 0.907620618
## 524 0.1270027745 0.348539336
## 530 0.2665205784 0.458110426
## 540 0.2131855546 0.530223881
## 558 0.2770604359 0.510976796
## 562 0.2056735931 0.433566746
## 580 0.5705413527 0.994652604
## 581 0.2458533479 0.494881822
#row indices (within trainset) of the support vectors listed above
svm_model$index
## [1] 8 11 30 39 42 48 71 73 75 85 103 104 105 124 127 131 141
## [18] 145 149 153 167 188 196 199 204 208 215 216 240 251 254 257 290 297
## [35] 300 306 317 329 335 347 353 372 390 391 398 440 443 445 9 18 23
## [52] 24 40 63 65 74 76 92 95 96 114 126 129 137 143 148 154 157
## [69] 170 173 178 180 186 192 202 214 228 229 281 303 310 311 318 336 351
## [86] 355 356 362 385 389 395 402 415 419 431 432
#offset of the decision function (decision boundary is w.x - rho = 0)
svm_model$rho
## [1] -0.1641859
#compute training accuracy
pred_train <- predict(svm_model, trainset)
mean(pred_train == trainset$y)
## [1] 1
#compute test accuracy (perfect: data were made separable with a margin)
pred_test <- predict(svm_model, testset)
mean(pred_test == testset$y)
## [1] 1
#build scatter plot of training dataset
scatter_plot <- ggplot(data = trainset, aes(x = x1, y = x2, color = y)) +
geom_point() +
scale_color_manual(values = c("red", "blue"))
#add plot layer marking out the support vectors (rows given by svm_model$index)
layered_plot <-
scatter_plot + geom_point(data = trainset[svm_model$index, ], aes(x = x1, y = x2), color = "purple", size = 4, alpha = 0.5)
#display plot
layered_plot
#calculate slope and intercept of decision boundary from weight vector and svm model:
#w.x - rho = 0 rearranges to x2 = (rho - w1*x1)/w2, hence the formulas below
w <- c(x1=6.55241, x2=-4.73278) # calculated manually outside of this module
slope_1 <- -w[1]/w[2]
intercept_1 <- svm_model$rho/w[2]
#build scatter plot of training dataset
scatter_plot <- ggplot(data = trainset, aes(x = x1, y = x2, color = y)) +
geom_point() + scale_color_manual(values = c("red", "blue"))
#add decision boundary
plot_decision <- scatter_plot + geom_abline(slope = slope_1, intercept = intercept_1)
#add margin boundaries (offset by 1/w2, i.e. where w.x - rho = +/-1)
plot_margins <- plot_decision +
geom_abline(slope = slope_1, intercept = intercept_1 - 1/w[2], linetype = "dashed")+
geom_abline(slope = slope_1, intercept = intercept_1 + 1/w[2], linetype = "dashed")
#display plot
plot_margins
#rebuild svm model (same settings) to use e1071's built-in plot method
svm_model<-
svm(y ~ ., data = trainset, type = "C-classification",
kernel = "linear", scale = FALSE)
#plot decision boundaries and support vectors
plot(x = svm_model, data = trainset)
#build svm model, cost = 1 (soft-margin penalty; small cost tolerates more
#margin violations -> wider margin, more support vectors)
svm_model_1 <- svm(y ~ .,
data = trainset,
type = "C-classification",
cost = 1,
kernel = "linear",
scale = FALSE)
#print model details
svm_model_1
##
## Call:
## svm(formula = y ~ ., data = trainset, type = "C-classification",
## cost = 1, kernel = "linear", scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
## gamma: 0.5
##
## Number of Support Vectors: 96
#build svm model, cost = 100 (large penalty -> narrower margin; note the
#support-vector count drops from 96 to 6 below)
svm_model_100 <- svm(y ~ .,
data = trainset,
type = "C-classification",
cost = 100,
kernel = "linear",
scale = FALSE)
#print model details
svm_model_100
##
## Call:
## svm(formula = y ~ ., data = trainset, type = "C-classification",
## cost = 100, kernel = "linear", scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 100
## gamma: 0.5
##
## Number of Support Vectors: 6
# Create the base train_plot
train_plot <- ggplot(data = trainset, aes(x = x1, y = x2, color = y)) +
geom_point() + scale_color_manual(values = c("red", "blue"))
# Weight vectors and intercepts for the cost=1 and cost=100 models were derived
# outside this document (e.g. from t(coefs) %*% SV -- TODO confirm)
w_1 <- c(x1=6.55241, x2=-4.73278) # calculated manually outside of this module
w_100 <- c(x1=18.3097, x2=-13.09972) # calculated manually outside of this module
intercept_1 <- -0.005515526 # calculated outside of this module
intercept_100 <- 0.001852543 # calculated outside of this module
slope_1 <- -w_1[1]/w_1[2]
slope_100 <- -w_100[1]/w_100[2]
#add decision boundary and margins for cost = 1 to training data scatter plot
train_plot_with_margins <- train_plot +
geom_abline(slope = slope_1, intercept = intercept_1) +
geom_abline(slope = slope_1, intercept = intercept_1 - 1/w_1[2], linetype = "dashed")+
geom_abline(slope = slope_1, intercept = intercept_1 + 1/w_1[2], linetype = "dashed")
#display plot
train_plot_with_margins
#add decision boundary and margins for cost = 100 to training data scatter plot
#(gold lines; larger ||w|| means the +/-1/w2 margin offsets are smaller)
train_plot_with_margins <- train_plot_with_margins +
geom_abline(slope = slope_100, intercept = intercept_100, color = "goldenrod") +
geom_abline(slope = slope_100, intercept = intercept_100 - 1/w_100[2], linetype = "dashed", color = "goldenrod")+
geom_abline(slope = slope_100, intercept = intercept_100 + 1/w_100[2], linetype = "dashed", color = "goldenrod")
#display plot
train_plot_with_margins
#refit the default-cost model and confirm it still classifies perfectly
svm_model<- svm(y ~ ., data = trainset, type = "C-classification", kernel = "linear", scale = FALSE)
#compute training accuracy
pred_train <- predict(svm_model, trainset)
mean(pred_train == trainset$y)
## [1] 1
#compute test accuracy
pred_test <- predict(svm_model, testset)
mean(pred_test == testset$y)
## [1] 1
#plot
plot(svm_model, trainset)
# Estimate linear-SVM accuracy on iris over repeated random 80/20 splits.
data(iris)
nTrials <- 100
accuracy <- numeric(nTrials)
# Work on a copy so the built-in iris dataset is not left with a spurious
# "train" column in the global environment after the loop.
irisData <- iris
#calculate accuracy for n distinct 80/20 train/test partitions
for (i in seq_len(nTrials)){
irisData[, "train"] <- ifelse(runif(nrow(irisData))<0.8, 1, 0)
trainColNum <- grep("train", names(irisData))
trainset <- irisData[irisData$train == 1, -trainColNum]
testset <- irisData[irisData$train == 0, -trainColNum]
svm_model <- svm(Species~ ., data = trainset,
type = "C-classification", kernel = "linear")
pred_test <- predict(svm_model, testset)
accuracy[i] <- mean(pred_test == testset$Species)
}
#mean accuracy and standard deviation
mean(accuracy)
## [1] 0.9643194
sd(accuracy)
## [1] 0.03704363
Chapter 3 - Polynomial Kernels
Generating radially separable datasets:
Linear SVM on radially separable datasets:
Kernel trick - devise a mathematical transformation that makes the data linearly separable:
Tuning SVM:
Example code includes:
#set number of variables and seed
n <- 400
set.seed(1)
#Generate data frame with two uniformly distributed predictors, x1 and x2
df <- data.frame(x1 = runif(n, min = -1, max = 1), x2 = runif(n, min = -1, max = 1))
#We want a circular boundary. Set boundary radius
radius <- 0.8
radius_squared <- radius^2
#create dependent categorical variable, y, with value -1 or 1 depending on whether point lies
#within or outside the circle. (^ is the documented exponentiation operator and
#matches radius^2 above; the undocumented parser alias ** is avoided.)
df$y <- factor(ifelse(df$x1^2 + df$x2^2 < radius_squared, -1, 1), levels = c(-1, 1))
#build scatter plot, distinguish class by color
scatter_plot <- ggplot(data = df, aes(x = x1, y = x2, color = y)) +
geom_point() +
scale_color_manual(values = c("red", "blue"))
#display plot
scatter_plot
# 75/25 train/test split by sampled row indices (no fresh seed here, so the
# split depends on the current RNG state)
inTrain <- sample(1:nrow(df), round(0.75*nrow(df)), replace=FALSE)
trainset <- df[sort(inTrain), ]
testset <- df[-inTrain, ]
#default cost model (cost = 1): a linear kernel cannot separate the circular
#classes, so accuracy stays near chance
svm_model_1 <- svm(y ~ ., data = trainset, type = "C-classification", cost = 1, kernel = "linear")
#training accuracy
pred_train <- predict(svm_model_1, trainset)
mean(pred_train == trainset$y)
## [1] 0.64
#test accuracy
pred_test <- predict(svm_model_1, testset)
mean(pred_test == testset$y)
## [1] 0.48
#cost = 100 model: raising cost does not help -- the limitation is the linear
#kernel, not the margin penalty (accuracies below are unchanged)
svm_model_100 <- svm(y ~ ., data = trainset, type = "C-classification", cost = 100, kernel = "linear")
#accuracy
pred_train <- predict(svm_model_100, trainset)
mean(pred_train == trainset$y)
## [1] 0.64
pred_test <- predict(svm_model_100, testset)
mean(pred_test == testset$y)
## [1] 0.48
#create vector to store accuracies and set random number seed
accuracy <- rep(NA, 100)
set.seed(2)
#estimate average linear-SVM accuracy over 100 random 80/20 train/test splits
for (i in 1:100){
df[, "train"] <- ifelse(runif(nrow(df))<0.8, 1, 0)
trainset <- df[df$train == 1, ]
testset <- df[df$train == 0, ]
trainColNum <- grep("train", names(trainset))
trainset <- trainset[, -trainColNum]
testset <- testset[, -trainColNum]
svm_model<- svm(y ~ ., data = trainset, type = "C-classification", kernel = "linear")
pred_test <- predict(svm_model, testset)
accuracy[i] <- mean(pred_test == testset$y)
}
#print average accuracy and standard deviation
mean(accuracy)
## [1] 0.5554571
sd(accuracy)
## [1] 0.04243524
#transform to squared coordinates: the circle x1^2 + x2^2 < 0.64 becomes the
#linear half-plane x1sq + x2sq < 0.64, so the classes are linearly separable
df1 <- data.frame(x1sq = df$x1^2, x2sq = df$x2^2, y = df$y)
#plot data points in the transformed space; guides(color = "none") hides the
#legend (the logical FALSE form is deprecated in current ggplot2)
plot_transformed <- ggplot(data = df1, aes(x = x1sq, y = x2sq, color = y)) +
geom_point()+ guides(color = "none") +
scale_color_manual(values = c("red", "blue"))
#add decision boundary (x1sq + x2sq = 0.64) and visualize
plot_decision <- plot_transformed + geom_abline(slope = -1, intercept = 0.64)
plot_decision
# Still want to use the old (non-squared) data; drop the leftover "train"
# column added by the loop above before re-splitting
inTrain <- sample(1:nrow(df), round(0.75*nrow(df)), replace=FALSE)
df$train <- NULL
trainset <- df[sort(inTrain), ]
testset <- df[-inTrain, ]
#degree-2 polynomial kernel matches the quadratic (circular) true boundary
svm_model <- svm(y ~ ., data = trainset, type = "C-classification", kernel = "polynomial", degree = 2)
#measure training and test accuracy
pred_train <- predict(svm_model, trainset)
mean(pred_train == trainset$y)
## [1] 0.9866667
pred_test <- predict(svm_model, testset)
mean(pred_test == testset$y)
## [1] 0.98
#plot
plot(svm_model, trainset)
#tune model: grid search over cost/gamma/coef0 (cross-validated; column 3 of
#trainset is the response y, hence the [, -3] / [, 3] split)
tune_out <-
tune.svm(x = trainset[, -3], y = trainset[, 3],
type = "C-classification",
kernel = "polynomial", degree = 2, cost = 10^(-1:2),
gamma = c(0.1, 1, 10), coef0 = c(0.1, 1, 10))
#list optimal values
tune_out$best.parameters$cost
## [1] 0.1
tune_out$best.parameters$gamma
## [1] 10
tune_out$best.parameters$coef0
## [1] 0.1
#Build tuned model
svm_model <- svm(y ~ ., data = trainset, type = "C-classification",
kernel = "polynomial", degree = 2,
cost = tune_out$best.parameters$cost,
gamma = tune_out$best.parameters$gamma,
coef0 = tune_out$best.parameters$coef0)
#Calculate training and test accuracies (tuning lifts test accuracy 0.98 -> 1)
pred_train <- predict(svm_model, trainset)
mean(pred_train == trainset$y)
## [1] 0.9966667
pred_test <- predict(svm_model, testset)
mean(pred_test == testset$y)
## [1] 1
#plot model
plot(svm_model, trainset)
Chapter 4 - Radial Basis Kernel Functions
Generating complex datasets:
Motivating the RBF kernel:
The RBF kernel simulates some of the principles of kNN using exponential decay:
Example code includes:
#number of data points
n <- 1000
#set seed
set.seed(1)
#create dataframe; note x1 is Gaussian (mean -0.5, sd 1) while x2 is uniform
#on [-1, 1]
df <- data.frame(x1 = rnorm(n, mean = -0.5, sd = 1), x2 = runif(n, min = -1, max = 1))
#set radius and centers: two circles of radius 0.8 centered at (-0.8, 0) and (0.8, 0)
radius <- 0.8
center_1 <- c(-0.8, 0)
center_2 <- c(0.8, 0)
radius_squared <- radius^2
#create binary classification variable: -1 inside either circle, 1 outside both
df$y <- factor(ifelse((df$x1-center_1[1])^2 + (df$x2-center_1[2])^2 < radius_squared |
(df$x1-center_2[1])^2 + (df$x2-center_2[2])^2 < radius_squared, -1, 1),
levels = c(-1, 1))
#create scatter plot
scatter_plot<- ggplot(data = df, aes(x = x1, y = x2, color = y)) +
geom_point() +
scale_color_manual(values = c("red", "blue"))
scatter_plot
# Create 75/25 split for train/test
inTrain <- sample(1:nrow(df), round(0.75*nrow(df)), replace=FALSE)
trainset <- df[sort(inTrain), ]
testset <- df[-inTrain, ]
#build model: linear kernel cannot fit the two-lobed boundary (accuracy ~0.56)
svm_model <- svm(y ~ ., data = trainset, type = "C-classification", kernel = "linear")
#accuracy
pred_train <- predict(svm_model, trainset)
mean(pred_train == trainset$y)
## [1] 0.5853333
pred_test <- predict(svm_model, testset)
mean(pred_test == testset$y)
## [1] 0.564
#plot model against testset
plot(svm_model, testset)
#build model: degree-2 polynomial does better but still cannot capture the
#union of two circles exactly (test accuracy ~0.79)
svm_model <- svm(y ~ ., data = trainset, type = "C-classification", kernel = "polynomial", degree = 2)
#accuracy
pred_train <- predict(svm_model, trainset)
mean(pred_train == trainset$y)
## [1] 0.8253333
pred_test <- predict(svm_model, testset)
mean(pred_test == testset$y)
## [1] 0.788
#plot model
plot(svm_model, trainset)
#create vector to store accuracies and set random number seed
accuracy <- rep(NA, 100)
set.seed(2)
# Create a dummy frame dfDum for use in the for loop, so df itself is not
# modified with a "train" column
dfDum <- df
#calculate accuracies for 100 training/test partitions (degree-2 polynomial)
for (i in 1:100){
dfDum[, "train"] <- ifelse(runif(nrow(dfDum))<0.8, 1, 0)
trainset <- dfDum[dfDum$train == 1, ]
testset <- dfDum[dfDum$train == 0, ]
trainColNum <- grep("train", names(trainset))
trainset <- trainset[, -trainColNum]
testset <- testset[, -trainColNum]
svm_model<- svm(y ~ ., data = trainset, type = "C-classification", kernel = "polynomial", degree = 2)
pred_test <- predict(svm_model, testset)
accuracy[i] <- mean(pred_test == testset$y)
}
#print average accuracy and standard deviation
mean(accuracy)
## [1] 0.804765
sd(accuracy)
## [1] 0.02398396
#create vector to store accuracies and set random number seed (same seed as
#above, so the RBF kernel is evaluated on the identical sequence of splits)
accuracy <- rep(NA, 100)
set.seed(2)
#calculate accuracies for 100 training/test partitions (radial basis kernel)
for (i in 1:100){
dfDum[, "train"] <- ifelse(runif(nrow(dfDum))<0.8, 1, 0)
trainset <- dfDum[dfDum$train == 1, ]
testset <- dfDum[dfDum$train == 0, ]
trainColNum <- grep("train", names(trainset))
trainset <- trainset[, -trainColNum]
testset <- testset[, -trainColNum]
svm_model<- svm(y ~ ., data = trainset, type = "C-classification", kernel = "radial")
pred_test <- predict(svm_model, testset)
accuracy[i] <- mean(pred_test == testset$y)
}
#print average accuracy and standard deviation: RBF clearly beats the
#polynomial kernel on this data (0.903 vs 0.805)
mean(accuracy)
## [1] 0.9034203
sd(accuracy)
## [1] 0.01786378
# Re-create original 75/25 split for train/test
inTrain <- sample(1:nrow(df), round(0.75*nrow(df)), replace=FALSE)
trainset <- df[sort(inTrain), ]
testset <- df[-inTrain, ]
#tune model: cross-validated grid search over gamma and cost
#(column 3 of trainset is the response y)
tune_out <- tune.svm(x = trainset[, -3], y = trainset[, 3],
gamma = 5*10^(-2:2),
cost = c(0.01, 0.1, 1, 10, 100),
type = "C-classification", kernel = "radial")
tune_out
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## gamma cost
## 5 1
##
## - best performance: 0.04
#build tuned model with the best cross-validated gamma and cost
svm_model <- svm(y~ ., data = trainset, type = "C-classification", kernel = "radial",
cost = tune_out$best.parameters$cost,
gamma = tune_out$best.parameters$gamma)
#calculate test accuracy
pred_test <- predict(svm_model, testset)
mean(pred_test == testset$y)
## [1] 0.956
#Plot decision boundary against test data
plot(svm_model, testset)
Chapter 1 - Introduction to Experimental Design
Introduction to experimental design:
Hypothesis testing:
Example code includes:
# load the ToothGrowth dataset
data("ToothGrowth")
#perform a two-sided one-sample t-test of H0: mean tooth length = 18
t.test(x = ToothGrowth$len, alternative = "two.sided", mu = 18)
##
## One Sample t-test
##
## data: ToothGrowth$len
## t = 0.82361, df = 59, p-value = 0.4135
## alternative hypothesis: true mean is not equal to 18
## 95 percent confidence interval:
## 16.83731 20.78936
## sample estimates:
## mean of x
## 18.81333
#perform a two-sample t-test of len by supplement type (the fractional df,
#55.3, in the tidy output below shows this is the Welch unequal-variance test)
ToothGrowth_ttest <- t.test(len ~ supp, data = ToothGrowth)
#tidy the t-test model object into a one-row data frame
broom::tidy(ToothGrowth_ttest)
## # A tibble: 1 x 10
## estimate estimate1 estimate2 statistic p.value parameter conf.low
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3.70 20.7 17.0 1.92 0.0606 55.3 -0.171
## # ... with 3 more variables: conf.high <dbl>, method <chr>,
## # alternative <chr>
#group by supp, dose, then examine how many observations in ToothGrowth there
#are by those groups -- a balanced 2x3 design with 10 replicates per cell
ToothGrowth %>%
group_by(supp, dose) %>%
summarize(n=n())
## # A tibble: 6 x 3
## # Groups: supp [?]
## supp dose n
## <fct> <dbl> <int>
## 1 OJ 0.5 10
## 2 OJ 1 10
## 3 OJ 2 10
## 4 VC 0.5 10
## 5 VC 1 10
## 6 VC 2 10
#create a boxplot with geom_boxplot()
ggplot(ToothGrowth, aes(x=as.factor(dose), y=len)) +
geom_boxplot()
#create the ToothGrowth_aov model object; dose is numeric here, so aov fits it
#as a 1-df linear trend (see Df = 1 below) rather than as a 3-level factor
ToothGrowth_aov <- aov(len ~ dose + supp, data = ToothGrowth)
#examine the model object with summary()
summary(ToothGrowth_aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## dose 1 2224.3 2224.3 123.99 6.31e-16 ***
## supp 1 205.3 205.3 11.45 0.0013 **
## Residuals 57 1022.6 17.9
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#one-sided test: H1 is that the true mean is less than 18
t.test(x = ToothGrowth$len, alternative = "less", mu = 18)
##
## One Sample t-test
##
## data: ToothGrowth$len
## t = 0.82361, df = 59, p-value = 0.7933
## alternative hypothesis: true mean is less than 18
## 95 percent confidence interval:
## -Inf 20.46358
## sample estimates:
## mean of x
## 18.81333
#one-sided test: H1 is that the true mean is greater than 18 (note the two
#one-sided p-values, 0.7933 and 0.2067, sum to 1 for the same t statistic)
t.test(x = ToothGrowth$len, alternative = "greater", mu = 18)
##
## One Sample t-test
##
## data: ToothGrowth$len
## t = 0.82361, df = 59, p-value = 0.2067
## alternative hypothesis: true mean is greater than 18
## 95 percent confidence interval:
## 17.16309 Inf
## sample estimates:
## mean of x
## 18.81333
#calculate power: power = NULL asks pwr.t.test to solve for power given n per
#group, effect size d, and significance level
pwr::pwr.t.test(n = 100, d = 0.35, sig.level = 0.10, type = "two.sample",
alternative = "two.sided", power = NULL
)
##
## Two-sample t test power calculation
##
## n = 100
## d = 0.35
## sig.level = 0.1
## power = 0.7943532
## alternative = two.sided
##
## NOTE: n is number in *each* group
#calculate sample size: here n = NULL, so the function solves for the n needed
#to reach 80% power
pwr::pwr.t.test(n = NULL, d = 0.25, sig.level = 0.05,
type = "one.sample", alternative = "greater", power = 0.8
)
##
## One-sample t test power calculation
##
## n = 100.2877
## d = 0.25
## sig.level = 0.05
## power = 0.8
## alternative = greater
Chapter 2 - Basic Experiments
Single and Multiple Factor Experiments:
Model Validation:
A/B Testing:
Example code includes:
# Load the Lending Club sample; readr echoes the column types it guessed below
lendingclub <- readr::read_csv("./RInputFiles/lendclub.csv")
## Parsed with column specification:
## cols(
## member_id = col_integer(),
## loan_amnt = col_integer(),
## funded_amnt = col_integer(),
## term = col_character(),
## int_rate = col_double(),
## emp_length = col_character(),
## home_ownership = col_character(),
## annual_inc = col_double(),
## verification_status = col_character(),
## loan_status = col_character(),
## purpose = col_character(),
## grade = col_character()
## )
#examine the variables with glimpse()
glimpse(lendingclub)
## Observations: 1,500
## Variables: 12
## $ member_id <int> 55096114, 1555332, 1009151, 69524202, 7212...
## $ loan_amnt <int> 11000, 10000, 13000, 5000, 18000, 14000, 8...
## $ funded_amnt <int> 11000, 10000, 13000, 5000, 18000, 14000, 8...
## $ term <chr> "36 months", "36 months", "60 months", "36...
## $ int_rate <dbl> 12.69, 6.62, 10.99, 12.05, 5.32, 16.99, 13...
## $ emp_length <chr> "10+ years", "10+ years", "3 years", "10+ ...
## $ home_ownership <chr> "RENT", "MORTGAGE", "MORTGAGE", "MORTGAGE"...
## $ annual_inc <dbl> 51000, 40000, 78204, 51000, 96000, 47000, ...
## $ verification_status <chr> "Not Verified", "Verified", "Not Verified"...
## $ loan_status <chr> "Current", "Fully Paid", "Fully Paid", "Cu...
## $ purpose <chr> "debt_consolidation", "debt_consolidation"...
## $ grade <chr> "C", "A", "B", "C", "A", "D", "C", "A", "D...
#find median loan_amnt, mean int_rate, and mean annual_inc with summarise()
lendingclub %>% summarise(median(loan_amnt), mean(int_rate), mean(annual_inc))
## # A tibble: 1 x 3
## `median(loan_amnt)` `mean(int_rate)` `mean(annual_inc)`
## <dbl> <dbl> <dbl>
## 1 13000 13.3 75736.
# use ggplot2 to build a bar chart of counts per loan purpose
ggplot(data=lendingclub, aes(x = purpose)) + geom_bar()
#use recode() to create the new purpose_recode variable, collapsing the many
#loan purposes into broader buckets; purposes not listed here (e.g. "other",
#"renewable_energy", seen in the model output below) pass through unchanged
lendingclub$purpose_recode <- lendingclub$purpose %>% recode(
"credit_card" = "debt_related",
"debt_consolidation" = "debt_related",
"medical" = "debt_related",
"car" = "big_purchase",
"major_purchase" = "big_purchase",
"vacation" = "big_purchase",
"moving" = "life_change",
"small_business" = "life_change",
"wedding" = "life_change",
"house" = "home_related",
"home_improvement" = "home_related"
)
#build a linear regression model of funded amount on recoded purpose
purpose_recode_model <- lm(funded_amnt ~ purpose_recode, data = lendingclub)
#look at results of purpose_recode_model
summary(purpose_recode_model)
##
## Call:
## lm(formula = funded_amnt ~ purpose_recode, data = lendingclub)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14472 -6251 -1322 4678 25761
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9888.1 1248.9 7.917 4.69e-15 ***
## purpose_recodedebt_related 5433.5 1270.5 4.277 2.02e-05 ***
## purpose_recodehome_related 4845.0 1501.0 3.228 0.00127 **
## purpose_recodelife_change 4095.3 2197.2 1.864 0.06254 .
## purpose_recodeother -649.3 1598.3 -0.406 0.68461
## purpose_recoderenewable_energy -1796.4 4943.3 -0.363 0.71636
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8284 on 1494 degrees of freedom
## Multiple R-squared: 0.03473, Adjusted R-squared: 0.0315
## F-statistic: 10.75 on 5 and 1494 DF, p-value: 3.598e-10
#get anova results for the lm fit and save as purpose_recode_anova
purpose_recode_anova <- anova(purpose_recode_model)
# look at the class of purpose_recode_anova
class(purpose_recode_anova)
## [1] "anova" "data.frame"
#Use aov() to build purpose_recode_aov (TukeyHSD below requires an aov fit,
#not an lm fit)
purpose_recode_aov <- aov(funded_amnt ~ purpose_recode, data = lendingclub)
#Conduct Tukey's HSD test (all pairwise group comparisons, adjusted p-values)
tukey_output <- TukeyHSD(purpose_recode_aov)
#tidy tukey_output to make sense of the results
broom::tidy(tukey_output)
## # A tibble: 15 x 6
## term comparison estimate conf.low conf.high adj.p.value
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 purpose_r~ debt_related-big_pu~ 5434. 1808. 9059. 2.91e-4
## 2 purpose_r~ home_related-big_pu~ 4845. 562. 9128. 1.61e-2
## 3 purpose_r~ life_change-big_pur~ 4095. -2174. 10365. 4.25e-1
## 4 purpose_r~ other-big_purchase -649. -5210. 3911. 9.99e-1
## 5 purpose_r~ renewable_energy-bi~ -1796. -15902. 12309. 9.99e-1
## 6 purpose_r~ home_related-debt_r~ -589. -3056. 1879. 9.84e-1
## 7 purpose_r~ life_change-debt_re~ -1338. -6539. 3863. 9.78e-1
## 8 purpose_r~ other-debt_related -6083. -9005. -3160. 5.32e-8
## 9 purpose_r~ renewable_energy-de~ -7230. -20894. 6434. 6.58e-1
## 10 purpose_r~ life_change-home_re~ -750. -6429. 4929. 9.99e-1
## 11 purpose_r~ other-home_related -5494. -9201. -1787. 3.58e-4
## 12 purpose_r~ renewable_energy-ho~ -6641. -20494. 7212. 7.46e-1
## 13 purpose_r~ other-life_change -4745. -10636. 1147. 1.95e-1
## 14 purpose_r~ renewable_energy-li~ -5892. -20482. 8698. 8.59e-1
## 15 purpose_r~ renewable_energy-ot~ -1147. -15088. 12794. 10.00e-1
#Use aov() to build purpose_emp_aov: a two-factor model adding employment
#length to the recoded purpose
purpose_emp_aov <- aov(funded_amnt ~ purpose_recode + emp_length, data=lendingclub)
#print purpose_emp_aov to the console
purpose_emp_aov
## Call:
## aov(formula = funded_amnt ~ purpose_recode + emp_length, data = lendingclub)
##
## Terms:
## purpose_recode emp_length Residuals
## Sum of Squares 3688783338 2044273211 100488872355
## Deg. of Freedom 5 11 1483
##
## Residual standard error: 8231.679
## Estimated effects may be unbalanced
#call summary() to see the p-values: both factors are significant here
summary(purpose_emp_aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## purpose_recode 5 3.689e+09 737756668 10.888 2.63e-10 ***
## emp_length 11 2.044e+09 185843019 2.743 0.00161 **
## Residuals 1483 1.005e+11 67760534
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#examine the summary of int_rate
summary(lendingclub$int_rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.32 9.99 12.99 13.31 16.29 26.77
#examine int_rate by grade: means rise monotonically A through G
lendingclub %>%
group_by(grade) %>%
summarise(mean = mean(int_rate), var = var(int_rate), median = median(int_rate))
## # A tibble: 7 x 4
## grade mean var median
## <chr> <dbl> <dbl> <dbl>
## 1 A 7.27 0.961 7.26
## 2 B 10.9 2.08 11.0
## 3 C 14.0 1.42 14.0
## 4 D 17.4 1.62 17.6
## 5 E 20.1 2.71 20.0
## 6 F 23.6 2.87 23.5
## 7 G 26.1 0.198 25.9
#make a boxplot of int_rate by grade
ggplot(lendingclub, aes(x = grade, y = int_rate)) + geom_boxplot()
#use aov() to create grade_aov plus call summary() to print results
grade_aov <- aov(int_rate ~ grade, data = lendingclub)
summary(grade_aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## grade 6 27013 4502 2637 <2e-16 ***
## Residuals 1493 2549 2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#for a 2x2 grid of plots (base-graphics diagnostic plots of the aov fit):
par(mfrow=c(2, 2))
#plot grade_aov
plot(grade_aov)
#back to defaults
par(mfrow=c(1, 1))
#Bartlett's test for homogeneity of variance across grades -- the tiny p-value
#in the output below indicates the equal-variance assumption is violated
bartlett.test(int_rate ~ grade, data=lendingclub)
##
## Bartlett test of homogeneity of variances
##
## data: int_rate by grade
## Bartlett's K-squared = 78.549, df = 6, p-value = 7.121e-15
#use the correct function from pwr to find the sample size: n = NULL solves
#for the per-group n needed to detect effect size d = 0.2 with 80% power
pwr::pwr.t.test(n=NULL, d=0.2, sig.level=0.05,
type="two.sample", alternative="two.sided", power=0.8
)
##
## Two-sample t test power calculation
##
## n = 393.4057
## d = 0.2
## sig.level = 0.05
## power = 0.8
## alternative = two.sided
##
## NOTE: n is number in *each* group
lc_A <- c(11976148, 1203719, 54998739, 5801830, 31587242, 7711391, 54494666, 57663583, 8967787, 21760921, 44765721, 8596988, 5794746, 59501253, 10578432, 36058744, 11727607, 357888, 51936863, 1178593, 57315811, 5705168, 46024211, 12947039, 57345207, 55299831, 28763037, 49763149, 20077511, 60216198, 12295190, 1570287, 61408414, 59121340, 32349527, 5773180, 26899704, 55412161, 2217935, 16462713, 9196065, 27802028, 40949245, 56007625, 56935379, 62187473, 20178048, 604912, 58533358, 652594, 44066849, 38942161, 6414816, 65617953, 51816492, 43489983, 6794967, 42345315, 59532019, 13107597, 63249029, 7371829, 12335467, 8560739, 7337238, 887484, 23493355, 41031080, 60537197, 12816159, 38446687, 51026618, 6374688, 18685270, 296645, 44439325, 4915968, 63449566, 25256236, 63407874, 36753301, 20728660, 7937228, 13058684, 636359, 50527238, 40450502, 1018943, 12438198, 3065732, 1510626, 5764344, 37840363, 27460227, 39751366, 5028066, 43956700, 56109033, 1412622, 44289534, 41770436, 49956562, 44409121, 47168726, 60953428, 52189251, 64281487, 51928150, 1002880, 4537354, 12605849, 477843, 6808167, 38629237, 33311208, 36109419, 58593881, 40362979, 440300, 9848361, 30656060, 15691500, 4375269, 15360849, 7077904, 66076532, 33350264, 4175651, 44006939, 21130605, 54098234, 53192890, 7371114, 12967808, 58061230, 34803392, 5544911, 28843825, 63244663, 38504887, 68565204, 1211255, 63427670, 56472411, 10548622, 43957279, 59313014, 5768723, 66210490, 25507112, 55472659, 61339767, 65684813, 45544639, 43710238, 46833245, 13028661, 13167268, 3064642, 62072249, 27631726, 65825964, 15540990, 64320858, 8605358, 17795606, 9894584, 543619, 2380700, 20959552, 57743104, 63917130, 38480348, 61393540, 19916851)
lc_A <- c(lc_A, 12528162, 7264617, 61480809, 36411752, 20139228, 21290880, 390228, 45584424, 17755019, 23413261, 15490914, 1254285, 875004, 24274579, 51006600, 11458143, 5125832, 37802077, 57327243, 41059894, 64978360, 58683523, 4290736, 40919379, 65029207, 7096004, 42285591, 7388784, 65914238, 46833088, 21221678, 62855006, 10557733, 44915714, 23083224, 67289213, 9746670, 349608, 66610322, 1595886, 3635144, 38419356, 9715410, 9726377, 621152, 23213635, 18685424, 65782663, 57304429, 20770003, 8865120, 58664359, 1454540, 42404539, 60952405, 61339308, 7367648, 11215938, 41207320, 23553299, 1681376, 7617266, 30485630, 10604792, 46044414, 63094909, 59189668, 10106916, 52058386, 17763104, 6396213, 8981232, 48070364, 10615808, 11956507, 38444903, 60216940, 58310439, 10099562, 7504691, 17533228, 62236540, 38626163, 55657128, 7728107, 42415348, 42454693, 4777573, 23834164, 25157042, 1339435, 50587486, 55998961, 32950014, 28422748, 492346, 50607472, 11335041, 4254623, 65058537, 5375256, 5646680, 44430975, 4054992, 55253292, 68375791, 16822421, 64978226, 59859214, 65424555, 10112206, 6908772, 67879649, 4794842, 31227479, 17423361, 64049774, 58624386, 14829134, 50233873, 44389635, 29684724, 452267, 43044890, 55942742, 19516366, 34443897, 57135665, 34392172, 17352839, 12896521, 40451807, 43255228, 40372428, 8568706, 68364520, 3486848, 40991148, 19196658, 8658538, 65885614, 38352455, 65674149, 1029473, 39290483, 47420355, 65364529, 32318884, 13115811, 48484348, 65975356, 56129109, 3378980, 31026386, 55231010, 41113253, 1480114, 51406116, 2445051, 8627441, 60942818, 55453270, 58573102, 25767158, 9655554, 49783137, 42273770, 32038806, 681948, 65059359, 48546050, 20169281, 68546780, 7065575, 46387142, 66180493, 58430918, 1390497, 41950574, 39888056, 11774847, 55308824, 51969105, 7936525, 5960208, 7700566, 14529825, 14688918, 43024566, 21110140, 55797803, 31236439, 6817136, 1467168, 36028128, 60781310, 66595886, 57548184, 3194733, 8589175, 1546517, 17654773, 40572454, 63284984, 
5780985, 39660177, 64050493, 55081623, 51346675, 1235123, 65633931, 66390924, 17413278, 57950994, 55911330, 11814853, 31357211, 56038385, 40038565, 64400706, 35034758, 60296238, 6527713, 5685238, 1062701, 63406447, 64008930, 63476297, 5114652, 20060374, 10085133, 61328568, 9435001, 56057656, 49934674, 39661404, 19616499, 34342717, 46653815, 45614269, 59290211, 31296803, 50605437, 46928301, 58562582, 63879452, 65733359, 51086476, 40601201, 9845217, 29213549, 41227222, 7337659, 46517072, 38610653, 9694813, 21350102, 46716202, 50535150, 39729407, 22263578, 25987787, 64913590, 19636684, 59311687, 4295372, 571012, 20588847, 63424767, 1099384, 3810242, 5604591, 39760687, 43739869, 56019939, 51526987, 45494853, 4302122, 21009984, 66210827, 67255219, 46613149, 63345017, 43570211, 62002161, 2214708, 4234697, 51055338, 19647002, 28593783, 6804647, 40542044, 42263319, 4784593, 19636686, 44015285, 55697847, 5814660, 15409525, 2307393, 54404433, 15490230, 62245810, 64969544, 48120716, 41040511, 51176224, 6376426, 60386775, 826517, 27601385, 8185587, 28564285, 68613325, 58623041, 60941473, 1635691, 7729270, 46417835, 57285778, 55960993, 66510262, 60285691, 61902329, 68565071)
lc_B <- c(62012715, 49974687, 27570947, 63417796, 61449107, 12906517, 57074291, 21021086, 404854, 15139172, 46774978, 50486061, 4305577, 65783354, 48544529, 31667129, 36980133, 19117791, 3845908, 846821, 40381968, 64018601, 57184860, 49963980, 44142706, 6327771, 20811335, 67336862, 3628833, 31247310, 4764984, 1619549, 56492219, 67959628, 61672211, 1472227, 55268407, 13497237, 57538143, 43096178, 35723158, 226780, 2307012, 1210773, 50273799, 28903599, 50839792, 44916418, 9714937, 51876659, 3919804, 12968154, 54978278, 6938022, 53854432, 63350177, 39692948, 67216234, 22253060, 59099446, 46135199, 11717805, 48596572, 8475061, 61462130, 21480483, 2014943, 41430440, 43196143, 243173, 61543762, 66562164, 67878273, 41100627, 11915326, 28753020, 12617369, 59090559, 55583726, 31256585, 544537, 61430245, 1681767, 7670078, 38506546, 36500594, 31367711, 46694948, 2080069, 38457330, 54524836, 27651989, 63358477, 62002922, 8995111, 45694307, 61470409, 17933815, 27370082, 66612753, 1536521, 54948920, 57548472, 876991, 40127147, 57365210, 1904740, 3195692, 743529, 67408356, 8766184, 23643466, 51336378, 13397002, 3700020, 49935259, 38455198, 63506356, 11386690, 32479126, 6300017, 67427011, 63344398, 51366616, 727247, 59291548, 21551336, 8776003, 16111335, 1051513, 61973285, 60764833, 59190150, 25406927, 10138072, 61361677, 32279884, 63337618, 49933340, 30565592, 3217416, 61883095, 63436296, 58290318, 29884855, 50353289, 14699170, 67625637, 6815821, 2286867, 6274586, 17853756, 55948157, 6995898, 44126015, 66643915, 41338910, 8626219, 67858810, 38597465, 45884338, 565018, 46436141, 15259622, 6594706, 39479497, 5535388, 5855546, 48734782, 2896555, 67296211, 713979, 33110251, 8987918, 1224687, 5637315, 484473, 9814600, 29694710, 60902260, 25897153, 40705483, 1439301, 3055155, 26319992, 6245002, 66441896, 46427698, 36330836, 8915199, 46205024, 62459417, 3497439, 54888931, 30475522, 38998249, 12636103, 60536957)
lc_B <- c(lc_B, 27521279, 2365984, 361549, 43430210, 35843833, 9768308, 12705933, 59179388, 60830121, 67929084, 36138408, 854552, 8865548, 13096420, 23836169, 61502149, 1621627, 11426617, 48274995, 41123011, 7296181, 29635336, 30565882, 8145149, 46116481, 21119590, 43894290, 65866235, 44143687, 873468, 12419378, 26378681, 55140334, 56964922, 61682200, 14338072, 65047247, 57267246, 59581503, 41093708, 48524124, 513842, 1685090, 42723216, 60647576, 55341080, 9735578, 41110083, 30255415, 56010965, 63214550, 67828966, 671468, 38540004, 65107371, 18645038, 26017706, 660734, 573283, 9454644, 64017354, 617449, 7645594, 43286428, 55941273, 8636865, 31226902, 46194753, 6160505, 1412225, 65741544, 24084859, 58532795, 41880754, 45515321, 60585561, 65272380, 7937327, 1489732, 17553239, 7638498, 1473206, 38162164, 3355990, 15610681, 57025137, 6254978, 38162571, 52768311, 5938741, 58101279, 18895673, 30175739, 38222417, 55909312, 65663878, 6607837, 24725076, 61722475, 11895058, 28182084, 185962, 55259655, 16241080, 66602227, 5781939, 60801476, 6996130, 12346893, 65672013, 19076244, 1475379, 9056893, 59492895, 56864322, 60942704, 44015940, 62225220, 39739191, 66435524, 44199929, 59471139, 38547168, 6205030, 38615829, 6698930, 66514563, 1623685, 60545969, 46703319, 39739315, 12636426, 65364691, 16403147, 9204637, 19306532, 66270322, 65653692, 22313524, 59082682, 19796545, 10766253, 50436003, 49363132, 27600713, 44865530, 57763719, 47857115, 48535477, 65986020, 58603818, 42934257, 1167844, 66390187, 58281312, 63888770, 48596526, 67385135, 24775459, 55090096, 12347068, 37317537, 64007908, 1683908, 11976597, 41019342, 6855113, 7964638, 65701227, 44037648, 23133074, 9787718, 61389384, 38418035, 33130454, 13038119, 14639242, 38505864, 65725266, 62904623, 68513661, 36039498, 6538734, 51857455, 59139740, 64341225, 21430833, 55455899, 17795459, 65128493, 46428798, 43216120, 59199242, 50364311, 41079485, 27711293, 63218354, 65492649, 50819365, 40737432, 377507, 65736437, 61488876, 
44886450, 31467727, 46651816, 11914779, 65352381, 24726593, 52989922, 43105128, 34322310, 8669148, 12795739, 38485516, 39559934, 4280915, 63437401, 7103037, 44946049, 15400322, 28583975, 59592185, 877645, 56019484, 3372858, 60556772, 19846532, 11658194, 6894823, 61414862, 52708301, 48806212, 12204849, 60863986, 3919883, 37661631, 47210580, 14689912, 23393084, 60961679, 6170889, 55191727, 14690280, 42415518, 65855022, 62156039, 38536464, 44603544, 63527328, 48182146, 25867085, 61952845, 4744682, 20110370, 65854766, 57722242, 11438361, 34111919, 53262232, 12247443, 64210396, 37630339, 41237564, 46722148, 65791211, 16882760, 7719304, 37622016, 3220774, 51906280, 12446784, 50064210, 57733299, 63437152, 38445791, 3730324, 56052115, 57354312, 58010576, 626701, 7224706, 64079786, 62167132, 8396526, 7625377, 12707224, 35084508, 56022111, 52027979, 43215589, 50425264, 59253209, 28312549, 67376619, 30795837, 43869662, 20849433, 55351366, 39549686, 22972745, 1025579)
# The specific member IDs in lc_A and lc_B are not in dataset lendingclub
# Assign each loan to experiment arm A, B, or C by member-ID lookup.
# case_when() evaluates conditions in order, so an ID present in both lc_A and
# lc_B lands in "A" — the same precedence as the original nested ifelse().
lendingclub_ab <- lendingclub %>%
  mutate(
    Group = case_when(
      member_id %in% lc_A ~ "A",
      member_id %in% lc_B ~ "B",
      TRUE ~ "C"
    )
  )
# ggplot(lendingclub_ab, aes(x=Group, y=loan_amnt)) + geom_boxplot()
#conduct a two-sided t-test
# t.test(loan_amnt ~ Group, data=lendingclub_ab)
#build lendingclub_multi
# lendingclub_multi <-lm(loan_amnt ~ Group + grade + verification_status, data=lendingclub_ab)
#examine lendingclub_multi results
# broom::tidy(lendingclub_multi)
Chapter 3 - Randomized Complete (and Balanced Incomplete) Block Designs
Intro to NHANES Dataset and Sampling:
Randomized Complete Block Designs (RCBD):
Balanced Incomplete Block Designs (BIBD):
Example code includes:
nhanes_demo <- readr::read_csv("./RInputFiles/nhanes_demo.csv")
## Parsed with column specification:
## cols(
## .default = col_integer(),
## wtint2yr = col_double(),
## wtmec2yr = col_double(),
## indfmpir = col_double()
## )
## See spec(...) for full column specifications.
nhanes_medical <- readr::read_csv("./RInputFiles/nhanes_medicalconditions.csv")
## Parsed with column specification:
## cols(
## .default = col_integer(),
## mcq230d = col_character(),
## mcq240b = col_character(),
## mcq240c = col_character(),
## mcq240d = col_character(),
## mcq240dk = col_character(),
## mcq240h = col_character(),
## mcq240i = col_character(),
## mcq240k = col_character(),
## mcq240q = col_character(),
## mcq240r = col_character(),
## mcq240s = col_character(),
## mcq240v = col_character(),
## mcq240y = col_character()
## )
## See spec(...) for full column specifications.
nhanes_bodymeasures <- readr::read_csv("./RInputFiles/nhanes_bodymeasures.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## seqn = col_integer(),
## bmdstats = col_integer(),
## bmiwt = col_integer(),
## bmirecum = col_integer(),
## bmihead = col_character(),
## bmiht = col_integer(),
## bmdbmic = col_integer(),
## bmileg = col_integer(),
## bmiarml = col_integer(),
## bmiarmc = col_integer(),
## bmiwaist = col_integer(),
## bmdsadcm = col_integer()
## )
## See spec(...) for full column specifications.
dummy_nhanes_final <- readr::read_csv("./RInputFiles/nhanes_final.csv")
## Parsed with column specification:
## cols(
## .default = col_integer(),
## wtint2yr = col_double(),
## wtmec2yr = col_double(),
## indfmpir = col_double(),
## mcq230d = col_character(),
## mcq240b = col_character(),
## mcq240c = col_character(),
## mcq240d = col_character(),
## mcq240dk = col_character(),
## mcq240h = col_character(),
## mcq240i = col_character(),
## mcq240k = col_character(),
## mcq240q = col_character(),
## mcq240r = col_character(),
## mcq240s = col_character(),
## mcq240v = col_character(),
## mcq240y = col_character(),
## bmxwt = col_double(),
## bmxrecum = col_double(),
## bmxhead = col_character(),
## bmihead = col_character()
## # ... with 11 more columns
## )
## See spec(...) for full column specifications.
#merge the 3 datasets you just created to create nhanes_combined
# Merge the three NHANES tables on the respondent ID ("seqn"), keeping only
# respondents present in all three tables (inner joins applied left-to-right).
nhanes_combined <- list(nhanes_demo, nhanes_medical, nhanes_bodymeasures) %>%
  purrr::reduce(inner_join, by = "seqn")
#fill in the dplyr code
nhanes_combined %>% group_by(mcq365d) %>% summarise(mean = mean(bmxwt, na.rm = TRUE))
## # A tibble: 4 x 2
## mcq365d mean
## <int> <dbl>
## 1 1 90.7
## 2 2 76.5
## 3 9 90.8
## 4 NA 33.5
#fill in the ggplot2 code
nhanes_combined %>% filter(ridageyr > 16) %>%
ggplot(aes(x=as.factor(mcq365d), y=bmxwt)) +
geom_boxplot()
## Warning: Removed 70 rows containing non-finite values (stat_boxplot).
#filter out anyone less than 16
nhanes_filter <- nhanes_combined %>% filter(ridageyr > 16)
#use simputation & impute bmxwt to fill in missing values
nhanes_final <- simputation::impute_median(nhanes_filter, bmxwt ~ riagendr)
#recode mcq365d with ifelse() & examine with table()
# Collapse the "don't know" code (9) into "no" (2), leaving 1/2 as the only
# mcq365d values; then tabulate to confirm the recode.
nhanes_final <- nhanes_final %>%
  mutate(mcq365d = ifelse(mcq365d == 9, 2, mcq365d))
table(nhanes_final$mcq365d)
##
## 1 2
## 1802 4085
#use sample() to create nhanes_srs
nhanes_srs <- nhanes_final[sample(nrow(nhanes_final), 2500), ]
#create nhanes_stratified with group_by() and sample_n()
nhanes_stratified <- nhanes_final %>%
group_by(riagendr) %>%
sample_n(2000)
table(nhanes_stratified$riagendr)
##
## 1 2
## 2000 2000
#load sampling package and create nhanes_cluster with cluster()
nhanes_cluster <- sampling::cluster(nhanes_final, "indhhin2", 6, method = "srswor")
#use str() to view design.rcbd's criteria
str(agricolae::design.rcbd)
## function (trt, r, serie = 2, seed = 0, kinds = "Super-Duper", first = TRUE,
## continue = FALSE, randomization = TRUE)
#build trt and rep
trt <- LETTERS[1:5]
rep <- 4
#Use trt and rep to build my.design.rcbd and view the sketch part of the object
my_design_rcbd <- agricolae::design.rcbd(trt, r=rep, seed = 42, serie=0)
my_design_rcbd$sketch
## [,1] [,2] [,3] [,4] [,5]
## [1,] "D" "E" "A" "C" "B"
## [2,] "B" "C" "A" "E" "D"
## [3,] "C" "D" "A" "E" "B"
## [4,] "A" "C" "B" "D" "E"
#make nhanes_final$riagendr a factor variable
nhanes_final$riagendr <- factor(nhanes_final$riagendr)
#use aov() to create nhanes_rcbd
nhanes_rcbd <- aov(bmxwt ~ mcq365d + riagendr, data=nhanes_final)
#check the results of nhanes_rcbd with summary()
summary(nhanes_rcbd)
## Df Sum Sq Mean Sq F value Pr(>F)
## mcq365d 1 228651 228651 568.8 <2e-16 ***
## riagendr 1 159706 159706 397.3 <2e-16 ***
## Residuals 5884 2365187 402
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#print the difference in weights by mcq365d and riagendr
nhanes_final %>% group_by(mcq365d, riagendr) %>% summarise(mean_wt = mean(bmxwt))
## # A tibble: 4 x 3
## # Groups: mcq365d [?]
## mcq365d riagendr mean_wt
## <dbl> <fct> <dbl>
## 1 1 1 95.1
## 2 1 2 86.7
## 3 2 1 82.6
## 4 2 2 71.3
#set up the 2x2 plotting grid and then plot nhanes_rcbd
par(mfrow=c(2, 2))
plot(nhanes_rcbd)
par(mfrow=c(1, 1))
#run the code to view the interaction plots
with(nhanes_final, interaction.plot(mcq365d, riagendr, bmxwt))
#run the code to view the interaction plots
with(nhanes_final, interaction.plot(riagendr, mcq365d, bmxwt))
#create my_design_bibd_1
# my_design_bibd_1 <- design.bib(LETTERS[1:3], k = 4, r = 16, serie = 0, seed = 42) # will throw an error
#create my_design_bibd_2
# my_design_bibd_2 <- design.bib(letters[1:2], k = 3, r = 5, serie = 0, seed = 42) # will throw warning
#create my_design_bibd_3
my_design_bibd_3 <- agricolae::design.bib(letters[1:4], k = 4, r = 6, serie = 0, seed = 42)
##
## Parameters BIB
## ==============
## Lambda : 6
## treatmeans : 4
## Block size : 4
## Blocks : 6
## Replication: 6
##
## Efficiency factor 1
##
## <<< Book >>>
my_design_bibd_3$sketch
## [,1] [,2] [,3] [,4]
## [1,] "d" "b" "a" "c"
## [2,] "d" "c" "b" "a"
## [3,] "c" "d" "b" "a"
## [4,] "a" "b" "d" "c"
## [5,] "b" "d" "a" "c"
## [6,] "a" "b" "d" "c"
# BIBD lambda parameter: how many times each pair of treatments appears
# together in a block, for t treatments, block size k, and r replications.
# A valid balanced incomplete block design requires this to be a whole number.
lambda <- function(t, k, r) {
  pair_appearances <- r * (k - 1) / (t - 1)
  pair_appearances
}
#calculate lambda
lambda(4, 3, 3)
## [1] 2
#build the data.frame
# Build the blocked cat-experiment data: creatinine outcome, food treatment
# (4 levels A-D), and coat color as the blocking factor (4 levels x 3 cats).
#
# BUG FIX: the original used as.data.frame(cbind(creatinine, food, color)).
# cbind() on mixed inputs produces a single numeric matrix, silently coercing
# both factors to their integer codes — which is why the printed aov() summary
# shows food and color each with 1 Df (treated as numeric covariates) instead
# of the 3 Df expected for 4-level factors. data.frame() keeps each column's
# type intact, so aov() treats food and color as factors.
creatinine <- c(1.98, 1.97, 2.35, 2.09, 1.87, 1.95, 2.08, 2.01, 1.84, 2.06, 1.97, 2.22)
food <- as.factor(c("A", "C", "D", "A", "B", "C", "B", "C", "D", "A", "B", "D"))
color <- as.factor(rep(c("Black", "White", "Orange", "Spotted"), each = 3))
cat_experiment <- data.frame(creatinine, food, color)
#create cat_model & then wrong_cat_model and examine them with summary()
cat_model <- aov(creatinine ~ food + color, data=cat_experiment)
summary(cat_model)
## Df Sum Sq Mean Sq F value Pr(>F)
## food 1 0.01204 0.012042 0.530 0.485
## color 1 0.00697 0.006971 0.307 0.593
## Residuals 9 0.20461 0.022735
#calculate lambda
lambda(3, 3, 2)
## [1] 2
#create weightlift_model & examine results (variable does not exist in dataset)
# weightlift_model <- aov(bmxarmc ~ weightlift_treat + ridreth1, data=nhanes_final)
# summary(weightlift_model)
Chapter 4 - Latin Squares, Graeco-Latin Squares, Factorial Experiments
Latin Squares have two blocking factors, assumed not to interact with each other or the treatment, and each with the same number of levels:
Graeco-Latin Squares builds on Latin squares by adding an additional blocking factor:
Factorial Experiments - designs in which 2+ variables are crossed in an experiment, with each combination considered a factor:
Next steps:
Example code includes:
nyc_scores <- readr::read_csv("./RInputFiles/nyc_scores.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## Zip_Code = col_integer(),
## Latitude = col_double(),
## Longitude = col_double(),
## Start_Time = col_time(format = ""),
## End_Time = col_time(format = ""),
## Student_Enrollment = col_integer(),
## Percent_White = col_double(),
## Percent_Black = col_double(),
## Percent_Hispanic = col_double(),
## Percent_Asian = col_double(),
## Average_Score_SAT_Math = col_integer(),
## Average_Score_SAT_Reading = col_integer(),
## Average_Score_SAT_Writing = col_integer(),
## Percent_Tested = col_double()
## )
## See spec(...) for full column specifications.
glimpse(nyc_scores)
## Observations: 435
## Variables: 22
## $ School_ID <chr> "02M260", "06M211", "01M539", "02M29...
## $ School_Name <chr> "Clinton School Writers and Artists"...
## $ Borough <chr> "Manhattan", "Manhattan", "Manhattan...
## $ Building_Code <chr> "M933", "M052", "M022", "M445", "M44...
## $ Street_Address <chr> "425 West 33rd Street", "650 Academy...
## $ City <chr> "Manhattan", "Manhattan", "Manhattan...
## $ State <chr> "NY", "NY", "NY", "NY", "NY", "NY", ...
## $ Zip_Code <int> 10001, 10002, 10002, 10002, 10002, 1...
## $ Latitude <dbl> 40.75321, 40.86605, 40.71873, 40.716...
## $ Longitude <dbl> -73.99786, -73.92486, -73.97943, -73...
## $ Phone_Number <chr> "212-695-9114", "718-935-3660", "212...
## $ Start_Time <time> NA, 08:30:00, 08:15:00, 08:00...
## $ End_Time <time> NA, 15:00:00, 16:00:00, 14:45...
## $ Student_Enrollment <int> NA, 87, 1735, 358, 383, 416, 255, 54...
## $ Percent_White <dbl> NA, 0.03, 0.29, 0.12, 0.03, 0.02, 0....
## $ Percent_Black <dbl> NA, 0.22, 0.13, 0.39, 0.28, 0.03, 0....
## $ Percent_Hispanic <dbl> NA, 0.68, 0.18, 0.41, 0.57, 0.06, 0....
## $ Percent_Asian <dbl> NA, 0.05, 0.39, 0.06, 0.09, 0.89, 0....
## $ Average_Score_SAT_Math <int> NA, NA, 657, 395, 418, 613, 410, 634...
## $ Average_Score_SAT_Reading <int> NA, NA, 601, 411, 428, 453, 406, 641...
## $ Average_Score_SAT_Writing <int> NA, NA, 601, 387, 415, 463, 381, 639...
## $ Percent_Tested <dbl> NA, NA, 0.91, 0.79, 0.65, 0.96, 0.60...
tEL <- c('PhD', 'BA', 'BA', 'MA', 'MA', 'PhD', 'MA', 'MA', 'BA', 'PhD', 'College Student', 'College Student', 'Grad Student', 'MA', 'MA', 'MA', 'BA', 'MA', 'BA', 'MA', 'College Student', 'PhD', 'MA', 'MA', 'BA', 'MA', 'College Student', 'BA', 'PhD', 'Grad Student', 'MA', 'Grad Student', 'MA', 'College Student', 'Grad Student', 'MA', 'Grad Student', 'BA', 'BA', 'College Student', 'Grad Student', 'College Student', 'BA', 'BA', 'PhD', 'BA', 'Grad Student', 'Grad Student', 'College Student', 'College Student', 'BA', 'PhD', 'College Student', 'PhD', 'PhD', 'PhD', 'College Student', 'Grad Student', 'MA', 'MA', 'BA', 'PhD', 'College Student', 'MA', 'MA', 'College Student', 'Grad Student', 'MA', 'PhD', 'MA', 'College Student', 'MA', 'PhD', 'MA', 'College Student', 'College Student', 'Grad Student', 'PhD', 'MA', 'MA', 'Grad Student', 'MA', 'MA', 'Grad Student', 'PhD', 'Grad Student', 'Grad Student', 'Grad Student', 'MA', 'PhD', 'BA', 'MA', 'Grad Student', 'BA', 'College Student', 'MA', 'College Student', 'Grad Student', 'Grad Student', 'College Student', 'MA', 'BA', 'BA', 'MA', 'MA', 'Grad Student', 'MA', 'Grad Student', 'MA', 'Grad Student', 'College Student', 'College Student', 'College Student', 'MA', 'BA', 'Grad Student', 'Grad Student', 'MA', 'College Student', 'BA', 'Grad Student', 'MA', 'Grad Student', 'PhD', 'MA', 'MA', 'College Student', 'MA', 'College Student', 'PhD', 'College Student', 'MA', 'MA', 'MA', 'MA', 'College Student', 'MA', 'BA', 'MA', 'Grad Student', 'BA', 'MA', 'MA', 'Grad Student', 'MA', 'MA', 'College Student', 'MA', 'MA', 'BA', 'MA', 'College Student', 'Grad Student', 'College Student', 'MA', 'BA', 'MA', 'BA', 'College Student', 'Grad Student', 'Grad Student', 'Grad Student', 'Grad Student', 'Grad Student', 'MA', 'BA', 'MA', 'BA', 'College Student', 'MA', 'BA', 'MA', 'Grad Student', 'MA', 'PhD', 'MA', 'BA', 'Grad Student', 'MA', 'BA', 'BA', 'MA', 'BA', 'College Student', 'BA', 'MA', 'MA', 'BA', 'MA', 'College Student', 'BA', 'Grad Student', 'MA', 
'BA', 'MA', 'MA', 'MA', 'BA', 'College Student', 'College Student')
tEL <- c(tEL, 'BA', 'Grad Student', 'BA', 'BA', 'MA', 'Grad Student', 'BA', 'MA', 'BA', 'PhD', 'MA', 'MA', 'MA', 'BA', 'College Student', 'PhD', 'BA', 'Grad Student', 'BA', 'College Student', 'BA', 'MA', 'College Student', 'MA', 'College Student', 'Grad Student', 'College Student', 'MA', 'PhD', 'BA', 'PhD', 'Grad Student', 'BA', 'BA', 'MA', 'MA', 'BA', 'PhD', 'College Student', 'MA', 'BA', 'College Student', 'BA', 'MA', 'College Student', 'MA', 'College Student', 'BA', 'MA', 'BA', 'BA', 'MA', 'PhD', 'BA', 'MA', 'Grad Student', 'College Student', 'MA', 'College Student', 'MA', 'MA', 'PhD', 'College Student', 'College Student', 'Grad Student', 'Grad Student', 'MA', 'College Student', 'Grad Student', 'Grad Student', 'Grad Student', 'MA', 'Grad Student', 'MA', 'BA', 'College Student', 'MA', 'Grad Student', 'College Student', 'MA', 'BA', 'BA', 'College Student', 'College Student', 'College Student', 'College Student', 'College Student', 'PhD', 'MA', 'College Student', 'MA', 'MA', 'MA', 'PhD', 'College Student', 'College Student', 'MA', 'MA', 'MA', 'PhD', 'MA', 'MA', 'PhD', 'MA', 'Grad Student', 'MA', 'Grad Student', 'MA', 'Grad Student', 'MA', 'MA', 'PhD', 'BA', 'BA', 'Grad Student', 'Grad Student', 'PhD', 'BA', 'BA', 'Grad Student', 'College Student', 'BA', 'College Student', 'MA', 'MA', 'MA', 'Grad Student', 'BA', 'BA', 'MA', 'Grad Student', 'PhD', 'BA', 'Grad Student', 'Grad Student', 'Grad Student', 'BA', 'MA', 'BA', 'College Student', 'College Student', 'Grad Student', 'MA', 'Grad Student', 'Grad Student', 'BA', 'BA', 'MA', 'College Student', 'BA', 'Grad Student', 'Grad Student', 'College Student', 'Grad Student', 'College Student', 'PhD', 'BA', 'MA', 'MA', 'BA', 'College Student', 'College Student', 'PhD', 'MA', 'BA', 'MA', 'MA', 'Grad Student', 'MA', 'PhD', 'MA', 'MA', 'Grad Student', 'College Student', 'MA', 'BA', 'BA', 'College Student', 'Grad Student', 'BA', 'MA', 'MA', 'Grad Student', 'BA', 'Grad Student', 'Grad Student', 'MA', 'PhD', 'Grad Student', 'Grad 
Student', 'MA', 'MA', 'PhD', 'College Student', 'College Student', 'MA', 'BA', 'MA', 'College Student', 'MA', 'PhD', 'BA', 'MA', 'College Student', 'PhD', 'PhD', 'College Student', 'MA', 'MA', 'MA', 'PhD', 'MA', 'BA', 'College Student', 'BA', 'BA', 'MA', 'MA', 'College Student', 'College Student', 'Grad Student', 'College Student', 'MA', 'MA', 'MA', 'Grad Student', 'MA', 'College Student', 'Grad Student', 'BA', 'Grad Student', 'BA', 'MA', 'College Student', 'MA')
nyc_scores <- nyc_scores %>%
mutate(Teacher_Education_Level=tEL)
glimpse(nyc_scores)
## Observations: 435
## Variables: 23
## $ School_ID <chr> "02M260", "06M211", "01M539", "02M29...
## $ School_Name <chr> "Clinton School Writers and Artists"...
## $ Borough <chr> "Manhattan", "Manhattan", "Manhattan...
## $ Building_Code <chr> "M933", "M052", "M022", "M445", "M44...
## $ Street_Address <chr> "425 West 33rd Street", "650 Academy...
## $ City <chr> "Manhattan", "Manhattan", "Manhattan...
## $ State <chr> "NY", "NY", "NY", "NY", "NY", "NY", ...
## $ Zip_Code <int> 10001, 10002, 10002, 10002, 10002, 1...
## $ Latitude <dbl> 40.75321, 40.86605, 40.71873, 40.716...
## $ Longitude <dbl> -73.99786, -73.92486, -73.97943, -73...
## $ Phone_Number <chr> "212-695-9114", "718-935-3660", "212...
## $ Start_Time <time> NA, 08:30:00, 08:15:00, 08:00...
## $ End_Time <time> NA, 15:00:00, 16:00:00, 14:45...
## $ Student_Enrollment <int> NA, 87, 1735, 358, 383, 416, 255, 54...
## $ Percent_White <dbl> NA, 0.03, 0.29, 0.12, 0.03, 0.02, 0....
## $ Percent_Black <dbl> NA, 0.22, 0.13, 0.39, 0.28, 0.03, 0....
## $ Percent_Hispanic <dbl> NA, 0.68, 0.18, 0.41, 0.57, 0.06, 0....
## $ Percent_Asian <dbl> NA, 0.05, 0.39, 0.06, 0.09, 0.89, 0....
## $ Average_Score_SAT_Math <int> NA, NA, 657, 395, 418, 613, 410, 634...
## $ Average_Score_SAT_Reading <int> NA, NA, 601, 411, 428, 453, 406, 641...
## $ Average_Score_SAT_Writing <int> NA, NA, 601, 387, 415, 463, 381, 639...
## $ Percent_Tested <dbl> NA, NA, 0.91, 0.79, 0.65, 0.96, 0.60...
## $ Teacher_Education_Level <chr> "PhD", "BA", "BA", "MA", "MA", "PhD"...
#mean, var, and median of Math score
nyc_scores %>%
group_by(Borough) %>%
summarise(mean = mean(Average_Score_SAT_Math, na.rm=TRUE),
var = var(Average_Score_SAT_Math, na.rm=TRUE),
median = median(Average_Score_SAT_Math, na.rm=TRUE))
## # A tibble: 5 x 4
## Borough mean var median
## <chr> <dbl> <dbl> <dbl>
## 1 Bronx 404. 2727. 396.
## 2 Brooklyn 416. 3658. 395
## 3 Manhattan 456. 7026. 433
## 4 Queens 462. 5168. 448
## 5 Staten Island 486. 6911. 466.
#mean, var, and median of Math score
nyc_scores %>%
group_by(Teacher_Education_Level) %>%
summarise(mean = mean(Average_Score_SAT_Math, na.rm=TRUE),
var = var(Average_Score_SAT_Math, na.rm=TRUE),
median = median(Average_Score_SAT_Math, na.rm=TRUE))
## # A tibble: 5 x 4
## Teacher_Education_Level mean var median
## <chr> <dbl> <dbl> <dbl>
## 1 BA 438. 5536. 418
## 2 College Student 424. 4807. 400.
## 3 Grad Student 437. 7071. 410
## 4 MA 432. 4304. 415
## 5 PhD 435. 4869. 420.
#mean, var, and median of Math score
nyc_scores %>%
group_by(Borough, Teacher_Education_Level) %>%
summarise(mean = mean(Average_Score_SAT_Math, na.rm=TRUE),
var = var(Average_Score_SAT_Math, na.rm=TRUE),
median = median(Average_Score_SAT_Math, na.rm=TRUE))
## # A tibble: 24 x 5
## # Groups: Borough [?]
## Borough Teacher_Education_Level mean var median
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Bronx BA 428. 7446. 407
## 2 Bronx College Student 384. 493. 382
## 3 Bronx Grad Student 400. 1776. 397
## 4 Bronx MA 401. 901. 395
## 5 Bronx PhD 400. 2468. 386
## 6 Brooklyn BA 425. 4005. 405
## 7 Brooklyn College Student 396. 3035. 382
## 8 Brooklyn Grad Student 436. 5756. 408
## 9 Brooklyn MA 414. 2401. 395
## 10 Brooklyn PhD 402 1868. 394.
## # ... with 14 more rows
# If we want to use SAT scores as our outcome, we need to examine their missingness
# First, look at the pattern of missingness using md.pattern() from the mice package
# There are 60 missing values in each of the three SAT score columns (Math, Reading, Writing)
# There are many R packages which help with more advanced forms of imputation, such as MICE, Amelia, mi, and more
# We will use the simputation package and impute_median() as we did previously
#examine missingness with md.pattern()
mice::md.pattern(nyc_scores)
## School_ID School_Name Borough Building_Code Street_Address City State
## 374 1 1 1 1 1 1 1
## 11 1 1 1 1 1 1 1
## 42 1 1 1 1 1 1 1
## 4 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1
## 3 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0
## Zip_Code Latitude Longitude Phone_Number Teacher_Education_Level
## 374 1 1 1 1 1
## 11 1 1 1 1 1
## 42 1 1 1 1 1
## 4 1 1 1 1 1
## 1 1 1 1 1 1
## 3 1 1 1 1 1
## 0 0 0 0 0
## Start_Time End_Time Student_Enrollment Percent_White Percent_Black
## 374 1 1 1 1 1
## 11 1 1 1 1 1
## 42 1 1 1 1 1
## 4 1 1 0 0 0
## 1 0 0 1 1 1
## 3 0 0 0 0 0
## 4 4 7 7 7
## Percent_Hispanic Percent_Asian Percent_Tested Average_Score_SAT_Math
## 374 1 1 1 1
## 11 1 1 1 0
## 42 1 1 0 0
## 4 0 0 0 0
## 1 1 1 1 1
## 3 0 0 0 0
## 7 7 49 60
## Average_Score_SAT_Reading Average_Score_SAT_Writing
## 374 1 1 0
## 11 0 0 3
## 42 0 0 4
## 4 0 0 9
## 1 1 1 2
## 3 0 0 11
## 60 60 272
#impute the Math, Writing, and Reading scores by Borough
nyc_scores_2 <- simputation::impute_median(nyc_scores, Average_Score_SAT_Math ~ Borough)
#convert Math score to numeric
nyc_scores_2$Average_Score_SAT_Math <- as.numeric(nyc_scores_2$Average_Score_SAT_Math)
#examine scores by Borough in both datasets, before and after imputation
nyc_scores %>%
group_by(Borough) %>%
summarise(median = median(Average_Score_SAT_Math, na.rm = TRUE), mean = mean(Average_Score_SAT_Math, na.rm = TRUE))
## # A tibble: 5 x 3
## Borough median mean
## <chr> <dbl> <dbl>
## 1 Bronx 396. 404.
## 2 Brooklyn 395 416.
## 3 Manhattan 433 456.
## 4 Queens 448 462.
## 5 Staten Island 466. 486.
nyc_scores_2 %>%
group_by(Borough) %>%
summarise(median = median(Average_Score_SAT_Math, na.rm = TRUE), mean = mean(Average_Score_SAT_Math, na.rm = TRUE))
## # A tibble: 5 x 3
## Borough median mean
## <chr> <dbl> <dbl>
## 1 Bronx 395 403.
## 2 Brooklyn 399 418.
## 3 Manhattan 418 446.
## 4 Queens 448 460.
## 5 Staten Island 466. 486.
#design a LS with 5 treatments A:E then look at the sketch
my_design_lsd <- agricolae::design.lsd(LETTERS[1:5], serie=0, seed=42)
my_design_lsd$sketch
## [,1] [,2] [,3] [,4] [,5]
## [1,] "B" "E" "D" "A" "C"
## [2,] "A" "D" "C" "E" "B"
## [3,] "E" "C" "B" "D" "A"
## [4,] "C" "A" "E" "B" "D"
## [5,] "D" "B" "A" "C" "E"
# To execute a Latin Square design on this data, suppose we want to know the effect of our tutoring program, which includes one-on-one tutoring, two small-group sizes, and in-school and after-school SAT prep classes
# A new dataset nyc_scores_ls is available that represents this experiment. Feel free to explore the dataset in the console.
# We'll block by Borough and Teacher_Education_Level to reduce their known variance on the score outcome
# Borough is a good blocking factor because schools in America are funded partly based on taxes paid in each city, so it will likely make a difference on quality of education
lsID <- c('11X290', '10X342', '09X260', '09X412', '12X479', '14K478', '32K554', '14K685', '22K405', '17K382', '05M692', '02M427', '02M308', '03M402', '02M282', '30Q501', '26Q495', '24Q455', '29Q326', '25Q670', '31R450', '31R445', '31R080', '31R460', '31R455')
lsTP <- c('One-on-One', 'Small Groups (2-3)', 'Small Groups (4-6)', 'SAT Prep Class (school hours)', 'SAT Prep Class (after school)', 'Small Groups (2-3)', 'Small Groups (4-6)', 'SAT Prep Class (school hours)', 'SAT Prep Class (after school)', 'One-on-One', 'Small Groups (4-6)', 'SAT Prep Class (school hours)', 'SAT Prep Class (after school)', 'One-on-One', 'Small Groups (2-3)', 'SAT Prep Class (school hours)', 'SAT Prep Class (after school)', 'One-on-One', 'Small Groups (2-3)', 'Small Groups (4-6)', 'SAT Prep Class (after school)', 'One-on-One', 'Small Groups (2-3)', 'Small Groups (4-6)', 'SAT Prep Class (school hours)')
# Build the Latin Square dataset: keep the 25 schools selected for the design
# and attach each school's assigned tutoring program.
# NOTE(review): mutate() attaches lsTP by row position of the *filtered* data,
# not by matching School_ID against lsID — this assumes the filtered rows come
# out in exactly the order lsTP was written for. Verify that assumption, or
# join a (School_ID, Tutoring_Program) lookup table on School_ID instead.
nyc_scores_ls <- nyc_scores_2 %>%
filter(School_ID %in% lsID) %>%
mutate(Tutoring_Program=lsTP)
#build nyc_scores_ls_lm
nyc_scores_ls_lm <- lm(Average_Score_SAT_Math ~ Tutoring_Program + Borough + Teacher_Education_Level,
data=nyc_scores_ls
)
#tidy the results with broom
nyc_scores_ls_lm %>% broom::tidy()
## # A tibble: 13 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 389. 71.6 5.44 1.50e-4
## 2 Tutoring_ProgramSAT Prep Class (a~ -11.2 50.4 -0.223 8.27e-1
## 3 Tutoring_ProgramSAT Prep Class (s~ 19.3 64.2 0.301 7.69e-1
## 4 Tutoring_ProgramSmall Groups (2-3) -41.3 59.4 -0.696 5.00e-1
## 5 Tutoring_ProgramSmall Groups (4-6) -47.4 50.4 -0.941 3.65e-1
## 6 BoroughBrooklyn 66.5 60.3 1.10 2.92e-1
## 7 BoroughManhattan 52.2 53.8 0.970 3.51e-1
## 8 BoroughQueens 88.3 59.9 1.47 1.67e-1
## 9 BoroughStaten Island 54.1 49.9 1.09 2.99e-1
## 10 Teacher_Education_LevelCollege St~ 29.1 82.2 0.354 7.29e-1
## 11 Teacher_Education_LevelGrad Stude~ 79.7 65.2 1.22 2.45e-1
## 12 Teacher_Education_LevelMA 16.2 47.1 0.343 7.37e-1
## 13 Teacher_Education_LevelPhD -70.7 103. -0.686 5.06e-1
#examine the results with anova
nyc_scores_ls_lm %>% anova()
## Analysis of Variance Table
##
## Response: Average_Score_SAT_Math
## Df Sum Sq Mean Sq F value Pr(>F)
## Tutoring_Program 4 41411 10352.8 1.7582 0.2021
## Borough 4 13297 3324.3 0.5646 0.6931
## Teacher_Education_Level 4 16734 4183.5 0.7105 0.6003
## Residuals 12 70658 5888.2
#create a boxplot of Math scores by Borough, with a title and x/y axis labels
# Boxplot of average SAT Math score by borough. Rows with missing scores are
# dropped by the boxplot stat (ggplot reports them in a warning, as shown).
nyc_scores %>%
  ggplot(aes(x = Borough, y = Average_Score_SAT_Math)) +
  geom_boxplot() +
  labs(
    title = "Average SAT Math Scores by Borough, NYC",
    x = "Borough (NYC)",
    y = "Average SAT Math Scores (2014-15)"
  )
## Warning: Removed 60 rows containing non-finite values (stat_boxplot).
#create trt1 and trt2
trt1 <- LETTERS[1:5]
trt2 <- 1:5
#create my_graeco_design
my_graeco_design <- agricolae::design.graeco(trt1, trt2, serie=0, seed=42)
#examine the parameters and sketch
my_graeco_design$parameters
## $design
## [1] "graeco"
##
## $trt1
## [1] "A" "B" "C" "D" "E"
##
## $trt2
## [1] 1 2 3 4 5
##
## $r
## [1] 5
##
## $serie
## [1] 0
##
## $seed
## [1] 42
##
## $kinds
## [1] "Super-Duper"
##
## [[8]]
## [1] TRUE
my_graeco_design$sketch
## [,1] [,2] [,3] [,4] [,5]
## [1,] "D 2" "E 3" "A 1" "C 5" "B 4"
## [2,] "E 1" "A 5" "C 4" "B 2" "D 3"
## [3,] "A 4" "C 2" "B 3" "D 1" "E 5"
## [4,] "C 3" "B 1" "D 5" "E 4" "A 2"
## [5,] "B 5" "D 4" "E 2" "A 3" "C 1"
glsID <- c('09X241', '10X565', '09X260', '07X259', '11X455', '18K563', '23K697', '32K403', '22K425', '16K688', '02M135', '06M348', '02M419', '02M489', '04M495', '30Q502', '24Q530', '30Q555', '24Q560', '27Q650', '31R440', '31R064', '31R450', '31R445', '31R460')
glsTP <- c('SAT Prep Class (school hours)', 'SAT Prep Class (after school)', 'One-on-One', 'Small Groups (4-6)', 'Small Groups (2-3)', 'SAT Prep Class (after school)', 'One-on-One', 'Small Groups (4-6)', 'Small Groups (2-3)', 'SAT Prep Class (school hours)', 'One-on-One', 'Small Groups (4-6)', 'Small Groups (2-3)', 'SAT Prep Class (school hours)', 'SAT Prep Class (after school)', 'Small Groups (4-6)', 'Small Groups (2-3)', 'SAT Prep Class (school hours)', 'SAT Prep Class (after school)', 'One-on-One', 'Small Groups (2-3)', 'SAT Prep Class (school hours)', 'SAT Prep Class (after school)', 'One-on-One', 'Small Groups (4-6)')
glsHT <- c('Small Group', 'Large Group', 'Individual', 'Mix of Large Group/Individual', 'Mix of Small Group/Individual', 'Individual', 'Mix of Large Group/Individual', 'Mix of Small Group/Individual', 'Small Group', 'Large Group', 'Mix of Small Group/Individual', 'Small Group', 'Large Group', 'Individual', 'Mix of Large Group/Individual', 'Large Group', 'Individual', 'Mix of Large Group/Individual', 'Mix of Small Group/Individual', 'Small Group', 'Mix of Large Group/Individual', 'Mix of Small Group/Individual', 'Small Group', 'Large Group', 'Individual')
# Build the Graeco-Latin Square dataset: keep the 25 selected schools and
# attach the two crossed treatment assignments (tutoring program and homework
# type).
# NOTE(review): glsTP and glsHT are attached positionally to the filtered rows,
# not keyed on School_ID — this assumes the filtered row order matches the
# glsID listing; confirm, or join on School_ID to make the mapping explicit.
nyc_scores_gls <- nyc_scores_2 %>%
filter(School_ID %in% glsID) %>%
mutate(Tutoring_Program=glsTP, Homework_Type=glsHT)
#build nyc_scores_gls_lm
nyc_scores_gls_lm <- lm(Average_Score_SAT_Math ~ Tutoring_Program + Borough + Teacher_Education_Level + Homework_Type, data=nyc_scores_gls)
#tidy the results with broom
nyc_scores_gls_lm %>% broom::tidy()
## # A tibble: 17 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 366. 27.7 13.2 1.01e-6
## 2 Tutoring_ProgramSAT Prep Class ~ 48.6 23.4 2.08 7.13e-2
## 3 Tutoring_ProgramSAT Prep Class ~ 31.9 25.3 1.26 2.42e-1
## 4 Tutoring_ProgramSmall Groups (2~ 37.0 23.5 1.57 1.55e-1
## 5 Tutoring_ProgramSmall Groups (4~ 63.3 23.6 2.68 2.81e-2
## 6 BoroughBrooklyn -20.6 24.8 -0.828 4.32e-1
## 7 BoroughManhattan 8.16 24.1 0.339 7.44e-1
## 8 BoroughQueens 42.9 21.4 2.01 7.96e-2
## 9 BoroughStaten Island 26.8 19.7 1.36 2.12e-1
## 10 Teacher_Education_LevelCollege ~ -4.76 26.3 -0.181 8.61e-1
## 11 Teacher_Education_LevelGrad Stu~ 37.0 34.5 1.07 3.15e-1
## 12 Teacher_Education_LevelMA 16.8 19.7 0.852 4.19e-1
## 13 Teacher_Education_LevelPhD 13.8 31.3 0.443 6.70e-1
## 14 Homework_TypeLarge Group 5.25 22.2 0.236 8.19e-1
## 15 Homework_TypeMix of Large Group~ -7.21 22.8 -0.317 7.60e-1
## 16 Homework_TypeMix of Small Group~ 12.4 21.5 0.575 5.81e-1
## 17 Homework_TypeSmall Group 56.0 24.4 2.30 5.04e-2
#examine the results with anova: sequential (Type I) sums of squares and an
#F test for each treatment and blocking factor as a whole
nyc_scores_gls_lm %>% anova()
## Analysis of Variance Table
##
## Response: Average_Score_SAT_Math
## Df Sum Sq Mean Sq F value Pr(>F)
## Tutoring_Program 4 18568.4 4642.1 5.1436 0.02384 *
## Borough 4 5321.8 1330.5 1.4742 0.29623
## Teacher_Education_Level 4 3449.6 862.4 0.9556 0.48085
## Homework_Type 4 10153.2 2538.3 2.8125 0.09956 .
## Residuals 8 7220.0 902.5
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# High/Low indicator for Percent_Tested, one value per school.
# NOTE(review): the direction of the 1/2 coding (which level is "high") is
# not shown here -- confirm before interpreting the factorial results.
pctTHL <- c(1, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 1, 2, 1, 2, 1, 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 1, 1, 2, 1, 2, 2, 2, 1, 2, 2, 1, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 1, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 2, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2)
# High/Low indicator for Percent_Black, one value per school.
pctBHL <- c(2, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 2, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 2, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 1, 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1)
# Simulated tutoring-program assignment ('Yes'/'No'), one value per school.
tP <- c('Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 
'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'No')
# Convert the three indicators to factors for the factorial ANOVA, and drop
# the teacher-education column, which is not used in this analysis.
nyc_scores <- nyc_scores %>%
select(-Teacher_Education_Level) %>%
mutate(Percent_Tested_HL=factor(pctTHL), Percent_Black_HL=factor(pctBHL), Tutoring_Program=factor(tP))
#build the boxplots for all 3 factor variables: tutoring program, pct black, pct tested
#(each warning below reflects the 60 schools with missing math scores)
ggplot(nyc_scores, aes(x=Tutoring_Program, y=Average_Score_SAT_Math)) +
geom_boxplot()
## Warning: Removed 60 rows containing non-finite values (stat_boxplot).
ggplot(nyc_scores, aes(x=Percent_Black_HL, y=Average_Score_SAT_Math)) +
geom_boxplot()
## Warning: Removed 60 rows containing non-finite values (stat_boxplot).
ggplot(nyc_scores, aes(x=Percent_Tested_HL, y=Average_Score_SAT_Math)) +
geom_boxplot()
## Warning: Removed 60 rows containing non-finite values (stat_boxplot).
#create nyc_scores_factorial and examine the results: a 2x2x2 factorial
#ANOVA; the * crosses the three factors, so all two-way and three-way
#interactions are estimated along with the main effects
nyc_scores_factorial <- aov(Average_Score_SAT_Math ~ Percent_Tested_HL * Percent_Black_HL * Tutoring_Program, data=nyc_scores)
broom::tidy(nyc_scores_factorial)
## # A tibble: 8 x 6
## term df sumsq meansq statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Percent_Tested_HL 1 3.92e5 3.92e5 116. 1.21e-23
## 2 Percent_Black_HL 1 1.87e5 1.87e5 55.3 7.52e-13
## 3 Tutoring_Program 1 9.42e3 9.42e3 2.78 9.63e- 2
## 4 Percent_Tested_HL:Percent_Blac~ 1 8.88e4 8.88e4 26.2 4.94e- 7
## 5 Percent_Tested_HL:Tutoring_Pro~ 1 1.98e3 1.98e3 0.584 4.45e- 1
## 6 Percent_Black_HL:Tutoring_Prog~ 1 5.16e3 5.16e3 1.52 2.18e- 1
## 7 Percent_Tested_HL:Percent_Blac~ 1 7.77e3 7.77e3 2.29 1.31e- 1
## 8 Residuals 367 1.24e6 3.39e3 NA NA
#use shapiro.test() to check the outcome for normality; the tiny p-value in
#the output below (< 2.2e-16) indicates the scores are not normal, so the
#ANOVA's normality assumption is questionable
shapiro.test(nyc_scores$Average_Score_SAT_Math)
##
## Shapiro-Wilk normality test
##
## data: nyc_scores$Average_Score_SAT_Math
## W = 0.84672, p-value < 2.2e-16
#plot nyc_scores_factorial to examine residuals: the four standard lm/aov
#diagnostic plots (Residuals vs Fitted, Normal Q-Q, Scale-Location,
#Residuals vs Leverage) shown in a 2x2 grid.
# par() returns the previous settings, so capture them and restore exactly
# what was in effect, rather than assuming the prior layout was c(1, 1).
old_par <- par(mfrow = c(2, 2))
plot(nyc_scores_factorial)
par(old_par)
Chapter 1 - One-Factor Models
Model Specification - Structural Equation Models (SEM) are used to explore relationships among observed and latent variables:
Model Analysis:
Model Assessment:
Example code includes:
#Load the lavaan library (latent variable analysis: CFA and SEM)
library(lavaan)
## This is lavaan 0.6-2
## lavaan is BETA software! Please report any bugs.
#Look at the dataset: Holzinger & Swineford (1939) mental-ability scores;
#columns 7-15 are the nine test variables x1-x9
data(HolzingerSwineford1939, package="lavaan")
head(HolzingerSwineford1939[ , 7:15])
## x1 x2 x3 x4 x5 x6 x7 x8 x9
## 1 3.333333 7.75 0.375 2.333333 5.75 1.2857143 3.391304 5.75 6.361111
## 2 5.333333 5.25 2.125 1.666667 3.00 1.2857143 3.782609 6.25 7.916667
## 3 4.500000 5.25 1.875 1.000000 1.75 0.4285714 3.260870 3.90 4.416667
## 4 5.333333 7.75 3.000 2.666667 4.50 2.4285714 3.000000 5.30 4.861111
## 5 4.833333 4.75 0.875 2.666667 4.00 2.5714286 3.695652 6.30 5.916667
## 6 5.333333 5.00 2.250 1.000000 3.00 0.8571429 4.347826 6.65 7.500000
#Define your model specification: a single latent factor ("textspeed")
#measured by the six indicators x4-x9 (=~ reads "is measured by")
text.model <- "textspeed =~ x4 + x5 + x6 + x7 + x8 + x9"
#Analyze the model with cfa(); by default the first indicator's loading is
#fixed to 1 to set the latent variable's scale (x4 = 1.000 in the output)
text.fit <- lavaan::cfa(model=text.model, data=HolzingerSwineford1939)
#Summarize the model
summary(text.fit)
## lavaan 0.6-2 ended normally after 20 iterations
##
## Optimization method NLMINB
## Number of free parameters 12
##
## Number of observations 301
##
## Estimator ML
## Model Fit Test Statistic 149.786
## Degrees of freedom 9
## P-value (Chi-square) 0.000
##
## Parameter Estimates:
##
## Information Expected
## Information saturated (h1) model Structured
## Standard Errors Standard
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|)
## textspeed =~
## x4 1.000
## x5 1.130 0.067 16.946 0.000
## x6 0.925 0.056 16.424 0.000
## x7 0.196 0.067 2.918 0.004
## x8 0.186 0.062 2.984 0.003
## x9 0.279 0.062 4.539 0.000
##
## Variances:
## Estimate Std.Err z-value P(>|z|)
## .x4 0.383 0.048 7.903 0.000
## .x5 0.424 0.059 7.251 0.000
## .x6 0.368 0.044 8.419 0.000
## .x7 1.146 0.094 12.217 0.000
## .x8 0.988 0.081 12.215 0.000
## .x9 0.940 0.077 12.142 0.000
## textspeed 0.968 0.112 8.647 0.000
# Re-summarize with standardized=TRUE: adds Std.lv and Std.all columns
# (estimates with the latent variable, or all variables, standardized).
summary(text.fit, standardized=TRUE)
## lavaan 0.6-2 ended normally after 20 iterations
##
## Optimization method NLMINB
## Number of free parameters 12
##
## Number of observations 301
##
## Estimator ML
## Model Fit Test Statistic 149.786
## Degrees of freedom 9
## P-value (Chi-square) 0.000
##
## Parameter Estimates:
##
## Information Expected
## Information saturated (h1) model Structured
## Standard Errors Standard
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## textspeed =~
## x4 1.000 0.984 0.846
## x5 1.130 0.067 16.946 0.000 1.112 0.863
## x6 0.925 0.056 16.424 0.000 0.910 0.832
## x7 0.196 0.067 2.918 0.004 0.193 0.177
## x8 0.186 0.062 2.984 0.003 0.183 0.181
## x9 0.279 0.062 4.539 0.000 0.275 0.273
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .x4 0.383 0.048 7.903 0.000 0.383 0.284
## .x5 0.424 0.059 7.251 0.000 0.424 0.256
## .x6 0.368 0.044 8.419 0.000 0.368 0.308
## .x7 1.146 0.094 12.217 0.000 1.146 0.969
## .x8 0.988 0.081 12.215 0.000 0.988 0.967
## .x9 0.940 0.077 12.142 0.000 0.940 0.926
## textspeed 0.968 0.112 8.647 0.000 1.000 1.000
# Re-summarize with fit.measures=TRUE: adds global fit indices to the
# output (baseline comparison, CFI/TLI, AIC/BIC, RMSEA, SRMR).
summary(text.fit, fit.measures=TRUE)
## lavaan 0.6-2 ended normally after 20 iterations
##
## Optimization method NLMINB
## Number of free parameters 12
##
## Number of observations 301
##
## Estimator ML
## Model Fit Test Statistic 149.786
## Degrees of freedom 9
## P-value (Chi-square) 0.000
##
## Model test baseline model:
##
## Minimum Function Test Statistic 681.336
## Degrees of freedom 15
## P-value 0.000
##
## User model versus baseline model:
##
## Comparative Fit Index (CFI) 0.789
## Tucker-Lewis Index (TLI) 0.648
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -2476.130
## Loglikelihood unrestricted model (H1) -2401.237
##
## Number of free parameters 12
## Akaike (AIC) 4976.261
## Bayesian (BIC) 5020.746
## Sample-size adjusted Bayesian (BIC) 4982.689
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.228
## 90 Percent Confidence Interval 0.197 0.261
## P-value RMSEA <= 0.05 0.000
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.148
##
## Parameter Estimates:
##
## Information Expected
## Information saturated (h1) model Structured
## Standard Errors Standard
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|)
## textspeed =~
## x4 1.000
## x5 1.130 0.067 16.946 0.000
## x6 0.925 0.056 16.424 0.000
## x7 0.196 0.067 2.918 0.004
## x8 0.186 0.062 2.984 0.003
## x9 0.279 0.062 4.539 0.000
##
## Variances:
## Estimate Std.Err z-value P(>|z|)
## .x4 0.383 0.048 7.903 0.000
## .x5 0.424 0.059 7.251 0.000
## .x6 0.368 0.044 8.419 0.000
## .x7 1.146 0.094 12.217 0.000
## .x8 0.988 0.081 12.215 0.000
## .x9 0.940 0.077 12.142 0.000
## textspeed 0.968 0.112 8.647 0.000
#Look at the dataset: lavaan's PoliticalDemocracy data, with indicator
#columns y1-y8 and x1-x3 (shown below)
data(PoliticalDemocracy, package="lavaan")
head(PoliticalDemocracy)
## y1 y2 y3 y4 y5 y6 y7 y8
## 1 2.50 0.000000 3.333333 0.000000 1.250000 0.000000 3.726360 3.333333
## 2 1.25 0.000000 3.333333 0.000000 6.250000 1.100000 6.666666 0.736999
## 3 7.50 8.800000 9.999998 9.199991 8.750000 8.094061 9.999998 8.211809
## 4 8.90 8.800000 9.999998 9.199991 8.907948 8.127979 9.999998 4.615086
## 5 10.00 3.333333 9.999998 6.666666 7.500000 3.333333 9.999998 6.666666
## 6 7.50 3.333333 6.666666 6.666666 6.250000 1.100000 6.666666 0.368500
## x1 x2 x3
## 1 4.442651 3.637586 2.557615
## 2 5.384495 5.062595 3.568079
## 3 5.961005 6.255750 5.224433
## 4 6.285998 7.567863 6.267495
## 5 5.863631 6.818924 4.573679
## 6 5.533389 5.135798 3.892270
#Define your model specification: one latent factor (poldemo60) measured by
#the four indicators y1-y4
politics.model <- "poldemo60 =~ y1 + y2 + y3 + y4"
#Analyze the model with cfa()
politics.fit <- lavaan::cfa(model = politics.model, data = PoliticalDemocracy)
#Summarize the model, requesting standardized estimates and fit indices in
#a single call
summary(politics.fit, standardized=TRUE, fit.measures=TRUE)
## lavaan 0.6-2 ended normally after 26 iterations
##
## Optimization method NLMINB
## Number of free parameters 8
##
## Number of observations 75
##
## Estimator ML
## Model Fit Test Statistic 10.006
## Degrees of freedom 2
## P-value (Chi-square) 0.007
##
## Model test baseline model:
##
## Minimum Function Test Statistic 159.183
## Degrees of freedom 6
## P-value 0.000
##
## User model versus baseline model:
##
## Comparative Fit Index (CFI) 0.948
## Tucker-Lewis Index (TLI) 0.843
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -704.138
## Loglikelihood unrestricted model (H1) -699.135
##
## Number of free parameters 8
## Akaike (AIC) 1424.275
## Bayesian (BIC) 1442.815
## Sample-size adjusted Bayesian (BIC) 1417.601
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.231
## 90 Percent Confidence Interval 0.103 0.382
## P-value RMSEA <= 0.05 0.014
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.046
##
## Parameter Estimates:
##
## Information Expected
## Information saturated (h1) model Structured
## Standard Errors Standard
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## poldemo60 =~
## y1 1.000 2.133 0.819
## y2 1.404 0.197 7.119 0.000 2.993 0.763
## y3 1.089 0.167 6.529 0.000 2.322 0.712
## y4 1.370 0.167 8.228 0.000 2.922 0.878
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .y1 2.239 0.512 4.371 0.000 2.239 0.330
## .y2 6.412 1.293 4.960 0.000 6.412 0.417
## .y3 5.229 0.990 5.281 0.000 5.229 0.492
## .y4 2.530 0.765 3.306 0.001 2.530 0.229
## poldemo60 4.548 1.106 4.112 0.000 1.000 1.000
Chapter 2 - Multi-Factor Models
Multi-factor Specification - exploring multiple latent variables and the relationships among them:
Model Structure:
Modification Indices:
Model Comparison:
Example code includes:
#Create your text model specification: one latent "text" factor measured by
#the three indicators x4-x6
text.model <- 'text =~ x4 + x5 + x6'
#Analyze the model; with exactly three indicators the one-factor model is
#just-identified (df = 0 in the output below), so overall fit is perfect
#by construction and the fit indices are uninformative
text.fit <- cfa(model=text.model, data=HolzingerSwineford1939)
#Summarize the model
summary(text.fit, standardized = TRUE, fit.measures = TRUE)
## lavaan 0.6-2 ended normally after 15 iterations
##
## Optimization method NLMINB
## Number of free parameters 6
##
## Number of observations 301
##
## Estimator ML
## Model Fit Test Statistic 0.000
## Degrees of freedom 0
##
## Model test baseline model:
##
## Minimum Function Test Statistic 497.430
## Degrees of freedom 3
## P-value 0.000
##
## User model versus baseline model:
##
## Comparative Fit Index (CFI) 1.000
## Tucker-Lewis Index (TLI) 1.000
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -1181.065
## Loglikelihood unrestricted model (H1) -1181.065
##
## Number of free parameters 6
## Akaike (AIC) 2374.130
## Bayesian (BIC) 2396.372
## Sample-size adjusted Bayesian (BIC) 2377.344
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.000
## 90 Percent Confidence Interval 0.000 0.000
## P-value RMSEA <= 0.05 NA
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.000
##
## Parameter Estimates:
##
## Information Expected
## Information saturated (h1) model Structured
## Standard Errors Standard
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## text =~
## x4 1.000 0.984 0.847
## x5 1.133 0.067 16.906 0.000 1.115 0.866
## x6 0.924 0.056 16.391 0.000 0.910 0.832
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .x4 0.382 0.049 7.805 0.000 0.382 0.283
## .x5 0.416 0.059 7.038 0.000 0.416 0.251
## .x6 0.369 0.044 8.367 0.000 0.369 0.308
## text 0.969 0.112 8.640 0.000 1.000 1.000
#Update the model specification by setting two paths to the label a:
#giving x5 and x6 the same label constrains their loadings to be equal
#(the output reports "Number of equality constraints 1"), which frees a
#degree of freedom and makes the model testable (df = 1)
text.model <- 'text =~ x4 + a*x5 + a*x6'
#Analyze the model
text.fit <- cfa(model = text.model, data = HolzingerSwineford1939)
#Summarize the model; compare the fit indices with the unconstrained run
summary(text.fit, standardized = TRUE, fit.measures = TRUE)
## lavaan 0.6-2 ended normally after 14 iterations
##
## Optimization method NLMINB
## Number of free parameters 6
## Number of equality constraints 1
##
## Number of observations 301
##
## Estimator ML
## Model Fit Test Statistic 11.227
## Degrees of freedom 1
## P-value (Chi-square) 0.001
##
## Model test baseline model:
##
## Minimum Function Test Statistic 497.430
## Degrees of freedom 3
## P-value 0.000
##
## User model versus baseline model:
##
## Comparative Fit Index (CFI) 0.979
## Tucker-Lewis Index (TLI) 0.938
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -1186.678
## Loglikelihood unrestricted model (H1) -1181.065
##
## Number of free parameters 5
## Akaike (AIC) 2383.357
## Bayesian (BIC) 2401.892
## Sample-size adjusted Bayesian (BIC) 2386.035
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.184
## 90 Percent Confidence Interval 0.098 0.288
## P-value RMSEA <= 0.05 0.007
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.073
##
## Parameter Estimates:
##
## Information Expected
## Information saturated (h1) model Structured
## Standard Errors Standard
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## text =~
## x4 1.000 0.983 0.846
## x5 (a) 1.009 0.054 18.747 0.000 0.992 0.815
## x6 (a) 1.009 0.054 18.747 0.000 0.992 0.866
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .x4 0.383 0.050 7.631 0.000 0.383 0.284
## .x5 0.499 0.054 9.164 0.000 0.499 0.336
## .x6 0.328 0.045 7.285 0.000 0.328 0.250
## text 0.967 0.113 8.585 0.000 1.000 1.000
#Create a two-factor model of text and speed variables, each latent factor
#measured by three indicators
twofactor.model <- 'text =~ x4 + x5 + x6
speed =~ x7 + x8 + x9'
#Previous one-factor model output (the equality-constrained text model),
#reprinted here for comparison against the two-factor fit further below
summary(text.fit, standardized = TRUE, fit.measures = TRUE)
## lavaan 0.6-2 ended normally after 14 iterations
##
## Optimization method NLMINB
## Number of free parameters 6
## Number of equality constraints 1
##
## Number of observations 301
##
## Estimator ML
## Model Fit Test Statistic 11.227
## Degrees of freedom 1
## P-value (Chi-square) 0.001
##
## Model test baseline model:
##
## Minimum Function Test Statistic 497.430
## Degrees of freedom 3
## P-value 0.000
##
## User model versus baseline model:
##
## Comparative Fit Index (CFI) 0.979
## Tucker-Lewis Index (TLI) 0.938
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -1186.678
## Loglikelihood unrestricted model (H1) -1181.065
##
## Number of free parameters 5
## Akaike (AIC) 2383.357
## Bayesian (BIC) 2401.892
## Sample-size adjusted Bayesian (BIC) 2386.035
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.184
## 90 Percent Confidence Interval 0.098 0.288
## P-value RMSEA <= 0.05 0.007
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.073
##
## Parameter Estimates:
##
## Information Expected
## Information saturated (h1) model Structured
## Standard Errors Standard
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## text =~
## x4 1.000 0.983 0.846
## x5 (a) 1.009 0.054 18.747 0.000 0.992 0.815
## x6 (a) 1.009 0.054 18.747 0.000 0.992 0.866
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .x4 0.383 0.050 7.631 0.000 0.383 0.284
## .x5 0.499 0.054 9.164 0.000 0.499 0.336
## .x6 0.328 0.045 7.285 0.000 0.328 0.250
## text 0.967 0.113 8.585 0.000 1.000 1.000
#Two-factor model specification (text: x4-x6; speed: x7-x9); the two
#factors are allowed to covary (see the text ~~ speed row in the output)
twofactor.model <- 'text =~ x4 + x5 + x6
speed =~ x7 + x8 + x9'
#Use cfa() to analyze the model
twofactor.fit <- cfa(model=twofactor.model, data=HolzingerSwineford1939)
#Use summary() to view the fitted model; the two-factor model fits far
#better than the one-factor version (CFI 0.990, RMSEA 0.051 below)
summary(twofactor.fit, standardized = TRUE, fit.measures = TRUE)
## lavaan 0.6-2 ended normally after 24 iterations
##
## Optimization method NLMINB
## Number of free parameters 13
##
## Number of observations 301
##
## Estimator ML
## Model Fit Test Statistic 14.354
## Degrees of freedom 8
## P-value (Chi-square) 0.073
##
## Model test baseline model:
##
## Minimum Function Test Statistic 681.336
## Degrees of freedom 15
## P-value 0.000
##
## User model versus baseline model:
##
## Comparative Fit Index (CFI) 0.990
## Tucker-Lewis Index (TLI) 0.982
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -2408.414
## Loglikelihood unrestricted model (H1) -2401.237
##
## Number of free parameters 13
## Akaike (AIC) 4842.828
## Bayesian (BIC) 4891.021
## Sample-size adjusted Bayesian (BIC) 4849.792
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.051
## 90 Percent Confidence Interval 0.000 0.093
## P-value RMSEA <= 0.05 0.425
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.039
##
## Parameter Estimates:
##
## Information Expected
## Information saturated (h1) model Structured
## Standard Errors Standard
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## text =~
## x4 1.000 0.984 0.847
## x5 1.132 0.067 16.954 0.000 1.114 0.865
## x6 0.925 0.056 16.438 0.000 0.911 0.833
## speed =~
## x7 1.000 0.674 0.619
## x8 1.150 0.165 6.990 0.000 0.775 0.766
## x9 0.878 0.123 7.166 0.000 0.592 0.587
##
## Covariances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## text ~~
## speed 0.173 0.052 3.331 0.001 0.261 0.261
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .x4 0.382 0.049 7.854 0.000 0.382 0.283
## .x5 0.418 0.059 7.113 0.000 0.418 0.252
## .x6 0.367 0.044 8.374 0.000 0.367 0.307
## .x7 0.729 0.084 8.731 0.000 0.729 0.616
## .x8 0.422 0.084 5.039 0.000 0.422 0.413
## .x9 0.665 0.071 9.383 0.000 0.665 0.655
## text 0.969 0.112 8.647 0.000 1.000 1.000
## speed 0.454 0.096 4.728 0.000 1.000 1.000
#Load the epi data (Eysenck Personality Inventory) from the psych package;
#data() with the package= argument loads the dataset without attaching the
#library
data(epi, package="psych")
#Specify a three-factor model with one correlation set to zero: the 0*
#premultiplier on the ~~ line fixes the extraversion-neuroticism
#covariance at 0 (it appears as 0.000 in the Covariances output)
epi.model <- 'extraversion =~ V1 + V3 + V5 + V8
neuroticism =~ V2 + V4 + V7 + V9
lying =~ V6 + V12 + V18 + V24
extraversion ~~ 0*neuroticism'
#Run the model
epi.fit <- cfa(model = epi.model, data = epi)
#Examine the output
summary(epi.fit, standardized = TRUE, fit.measures = TRUE)
## lavaan 0.6-2 ended normally after 118 iterations
##
## Optimization method NLMINB
## Number of free parameters 26
##
## Used Total
## Number of observations 3193 3570
##
## Estimator ML
## Model Fit Test Statistic 584.718
## Degrees of freedom 52
## P-value (Chi-square) 0.000
##
## Model test baseline model:
##
## Minimum Function Test Statistic 2196.019
## Degrees of freedom 66
## P-value 0.000
##
## User model versus baseline model:
##
## Comparative Fit Index (CFI) 0.750
## Tucker-Lewis Index (TLI) 0.683
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -23208.145
## Loglikelihood unrestricted model (H1) -22915.787
##
## Number of free parameters 26
## Akaike (AIC) 46468.291
## Bayesian (BIC) 46626.077
## Sample-size adjusted Bayesian (BIC) 46543.464
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.057
## 90 Percent Confidence Interval 0.053 0.061
## P-value RMSEA <= 0.05 0.004
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.058
##
## Parameter Estimates:
##
## Information Expected
## Information saturated (h1) model Structured
## Standard Errors Standard
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## extraversion =~
## V1 1.000 0.052 0.115
## V3 1.360 0.329 4.127 0.000 0.070 0.141
## V5 -2.829 0.554 -5.109 0.000 -0.146 -0.391
## V8 7.315 1.832 3.992 0.000 0.377 0.797
## neuroticism =~
## V2 1.000 0.228 0.457
## V4 0.424 0.053 8.004 0.000 0.097 0.196
## V7 1.395 0.093 15.023 0.000 0.318 0.648
## V9 1.205 0.078 15.506 0.000 0.275 0.553
## lying =~
## V6 1.000 0.135 0.272
## V12 -0.851 0.132 -6.435 0.000 -0.115 -0.291
## V18 -0.785 0.122 -6.421 0.000 -0.106 -0.289
## V24 1.086 0.161 6.734 0.000 0.147 0.339
##
## Covariances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## extraversion ~~
## neuroticism 0.000 0.000 0.000
## lying -0.002 0.001 -3.313 0.001 -0.258 -0.258
## neuroticism ~~
## lying -0.014 0.002 -6.867 0.000 -0.469 -0.469
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .V1 0.198 0.005 39.567 0.000 0.198 0.987
## .V3 0.243 0.006 39.278 0.000 0.243 0.980
## .V5 0.118 0.005 23.900 0.000 0.118 0.847
## .V8 0.082 0.026 3.084 0.002 0.082 0.364
## .V2 0.197 0.006 32.516 0.000 0.197 0.791
## .V4 0.235 0.006 38.906 0.000 0.235 0.962
## .V7 0.140 0.007 19.412 0.000 0.140 0.580
## .V9 0.172 0.006 26.591 0.000 0.172 0.694
## .V6 0.228 0.007 34.520 0.000 0.228 0.926
## .V12 0.143 0.004 33.670 0.000 0.143 0.916
## .V18 0.124 0.004 33.753 0.000 0.124 0.917
## .V24 0.166 0.005 31.021 0.000 0.166 0.885
## extraversion 0.003 0.001 2.480 0.013 1.000 1.000
## neuroticism 0.052 0.005 10.010 0.000 1.000 1.000
## lying 0.018 0.004 4.500 0.000 1.000 1.000
#Specify a three-factor model where lying is predicted by neuroticism:
#the ~ operator adds a regression path between latent variables, so the
#fit gains a Regressions section (lying ~ neuroticism = -0.298 below)
epi.model <- 'extraversion =~ V1 + V3 + V5 + V8
neuroticism =~ V2 + V4 + V7 + V9
lying =~ V6 + V12 + V18 + V24
lying ~ neuroticism'
#Run the model
epi.fit <- cfa(model = epi.model, data = epi)
#Examine the output
summary(epi.fit, standardized = TRUE, fit.measures = TRUE)
## lavaan 0.6-2 ended normally after 120 iterations
##
## Optimization method NLMINB
## Number of free parameters 26
##
## Used Total
## Number of observations 3193 3570
##
## Estimator ML
## Model Fit Test Statistic 534.426
## Degrees of freedom 52
## P-value (Chi-square) 0.000
##
## Model test baseline model:
##
## Minimum Function Test Statistic 2196.019
## Degrees of freedom 66
## P-value 0.000
##
## User model versus baseline model:
##
## Comparative Fit Index (CFI) 0.774
## Tucker-Lewis Index (TLI) 0.713
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -23183.000
## Loglikelihood unrestricted model (H1) -22915.787
##
## Number of free parameters 26
## Akaike (AIC) 46417.999
## Bayesian (BIC) 46575.786
## Sample-size adjusted Bayesian (BIC) 46493.173
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.054
## 90 Percent Confidence Interval 0.050 0.058
## P-value RMSEA <= 0.05 0.058
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.053
##
## Parameter Estimates:
##
## Information Expected
## Information saturated (h1) model Structured
## Standard Errors Standard
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## extraversion =~
## V1 1.000 0.052 0.115
## V3 1.135 0.268 4.230 0.000 0.059 0.118
## V5 -2.497 0.443 -5.638 0.000 -0.129 -0.346
## V8 8.223 2.008 4.096 0.000 0.425 0.898
## neuroticism =~
## V2 1.000 0.223 0.447
## V4 0.462 0.054 8.493 0.000 0.103 0.209
## V7 1.435 0.093 15.368 0.000 0.320 0.652
## V9 1.214 0.078 15.570 0.000 0.271 0.545
## lying =~
## V6 1.000 0.125 0.252
## V12 -0.943 0.150 -6.274 0.000 -0.118 -0.298
## V18 -0.905 0.143 -6.339 0.000 -0.113 -0.308
## V24 1.187 0.182 6.509 0.000 0.148 0.342
##
## Regressions:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## lying ~
## neuroticism -0.298 0.043 -6.943 0.000 -0.532 -0.532
##
## Covariances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## extraversion ~~
## neuroticism 0.003 0.001 3.761 0.000 0.240 0.240
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .V1 0.198 0.005 39.671 0.000 0.198 0.987
## .V3 0.244 0.006 39.651 0.000 0.244 0.986
## .V5 0.123 0.004 28.256 0.000 0.123 0.881
## .V8 0.043 0.033 1.302 0.193 0.043 0.193
## .V2 0.200 0.006 33.262 0.000 0.200 0.800
## .V4 0.233 0.006 38.804 0.000 0.233 0.956
## .V7 0.139 0.007 20.087 0.000 0.139 0.575
## .V9 0.174 0.006 27.907 0.000 0.174 0.703
## .V6 0.231 0.007 35.398 0.000 0.231 0.936
## .V12 0.143 0.004 33.349 0.000 0.143 0.911
## .V18 0.122 0.004 32.825 0.000 0.122 0.905
## .V24 0.166 0.005 30.854 0.000 0.166 0.883
## extraversion 0.003 0.001 2.643 0.008 1.000 1.000
## neuroticism 0.050 0.005 9.947 0.000 1.000 1.000
## .lying 0.011 0.003 3.970 0.000 0.717 0.717
#Calculate the variance of V1
var(epi$V1, na.rm=TRUE)
## [1] 0.2018335
#Examine the modification indices (lavaan): each row reports the expected
#drop in the model chi-square (mi) if the listed fixed parameter were freed,
#plus the expected parameter change (epc) and its standardized variants
#(sepc.*). sort=TRUE orders the rows from largest to smallest mi, so the
#most promising respecifications appear first.
modificationindices(epi.fit, sort=TRUE)
## lhs op rhs mi epc sepc.lv sepc.all sepc.nox
## 39 neuroticism =~ V3 169.990 -0.704 -0.157 -0.316 -0.316
## 38 neuroticism =~ V1 125.582 0.545 0.122 0.271 0.271
## 47 lying =~ V3 119.773 1.294 0.162 0.325 0.325
## 69 V3 ~~ V7 77.456 -0.034 -0.034 -0.186 -0.186
## 57 V1 ~~ V2 76.872 0.033 0.033 0.164 0.164
## 46 lying =~ V1 64.865 -0.857 -0.107 -0.239 -0.239
## 58 V1 ~~ V4 27.288 0.020 0.020 0.093 0.093
## 34 extraversion =~ V6 24.201 -0.948 -0.049 -0.099 -0.099
## 121 neuroticism ~~ lying 23.184 0.020 0.854 0.854 0.854
## 120 extraversion ~~ lying 23.184 -0.001 -0.205 -0.205 -0.205
## 122 neuroticism ~ lying 23.184 1.798 1.008 1.008 1.008
## 102 V4 ~~ V12 18.360 0.015 0.015 0.079 0.079
## 88 V8 ~~ V6 17.629 -0.016 -0.016 -0.161 -0.161
## 49 lying =~ V8 17.324 -0.641 -0.080 -0.170 -0.170
## 66 V3 ~~ V8 16.886 0.042 0.042 0.411 0.411
## 74 V3 ~~ V24 16.332 0.015 0.015 0.075 0.075
## 31 extraversion =~ V4 15.808 0.781 0.040 0.082 0.082
## 50 lying =~ V2 15.345 0.713 0.089 0.179 0.179
## 56 V1 ~~ V8 14.184 -0.035 -0.035 -0.373 -0.373
## 112 V9 ~~ V18 13.078 0.011 0.011 0.076 0.076
## 45 neuroticism =~ V24 10.674 0.319 0.071 0.164 0.164
## 65 V3 ~~ V5 10.297 -0.011 -0.011 -0.064 -0.064
## 67 V3 ~~ V2 9.800 -0.013 -0.013 -0.058 -0.058
## 83 V5 ~~ V24 9.658 0.008 0.008 0.058 0.058
## 85 V8 ~~ V4 9.532 0.012 0.012 0.118 0.118
## 98 V2 ~~ V24 8.780 0.011 0.011 0.060 0.060
## 115 V6 ~~ V18 8.093 0.010 0.010 0.062 0.062
## 63 V1 ~~ V18 7.944 0.008 0.008 0.052 0.052
## 116 V6 ~~ V24 7.436 0.012 0.012 0.063 0.063
## 84 V8 ~~ V2 7.279 -0.010 -0.010 -0.112 -0.112
## 110 V9 ~~ V6 7.179 0.011 0.011 0.055 0.055
## 70 V3 ~~ V9 7.091 -0.011 -0.011 -0.052 -0.052
## 60 V1 ~~ V9 6.550 0.009 0.009 0.050 0.050
## 73 V3 ~~ V18 6.533 -0.008 -0.008 -0.047 -0.047
## 30 extraversion =~ V2 6.090 -0.487 -0.025 -0.050 -0.050
## 44 neuroticism =~ V18 5.972 0.186 0.042 0.113 0.113
## 51 lying =~ V4 5.278 -0.411 -0.051 -0.104 -0.104
## 33 extraversion =~ V9 4.903 -0.446 -0.023 -0.046 -0.046
## 87 V8 ~~ V9 4.841 -0.009 -0.009 -0.099 -0.099
## 59 V1 ~~ V7 4.588 0.007 0.007 0.045 0.045
## 68 V3 ~~ V4 4.293 0.009 0.009 0.037 0.037
## 106 V7 ~~ V6 4.219 -0.009 -0.009 -0.048 -0.048
## 93 V2 ~~ V7 4.179 0.014 0.014 0.082 0.082
## 75 V5 ~~ V8 3.508 0.056 0.056 0.775 0.775
## 99 V4 ~~ V7 3.431 -0.009 -0.009 -0.048 -0.048
## 95 V2 ~~ V6 3.106 0.007 0.007 0.034 0.034
## 55 V1 ~~ V5 2.977 0.005 0.005 0.034 0.034
## 94 V2 ~~ V9 2.860 0.009 0.009 0.051 0.051
## 35 extraversion =~ V12 2.777 0.256 0.013 0.033 0.033
## 37 extraversion =~ V24 2.668 -0.275 -0.014 -0.033 -0.033
## 48 lying =~ V5 2.453 0.155 0.019 0.052 0.052
## 109 V7 ~~ V24 2.021 0.005 0.005 0.035 0.035
## 89 V8 ~~ V12 1.910 0.004 0.004 0.054 0.054
## 91 V8 ~~ V24 1.733 -0.004 -0.004 -0.052 -0.052
## 86 V8 ~~ V7 1.640 0.005 0.005 0.068 0.068
## 80 V5 ~~ V6 1.631 0.004 0.004 0.023 0.023
## 97 V2 ~~ V18 1.600 -0.004 -0.004 -0.025 -0.025
## 82 V5 ~~ V18 1.301 0.003 0.003 0.021 0.021
## 54 V1 ~~ V3 1.227 -0.004 -0.004 -0.020 -0.020
## 100 V4 ~~ V9 1.198 -0.005 -0.005 -0.024 -0.024
## 64 V1 ~~ V24 1.153 0.004 0.004 0.020 0.020
## 53 lying =~ V9 0.995 -0.189 -0.024 -0.048 -0.048
## 41 neuroticism =~ V8 0.823 0.163 0.036 0.077 0.077
## 62 V1 ~~ V12 0.814 0.003 0.003 0.016 0.016
## 90 V8 ~~ V18 0.797 0.003 0.003 0.035 0.035
## 76 V5 ~~ V2 0.714 0.003 0.003 0.016 0.016
## 117 V12 ~~ V18 0.698 -0.003 -0.003 -0.019 -0.019
## 71 V3 ~~ V6 0.637 -0.003 -0.003 -0.014 -0.014
## 36 extraversion =~ V18 0.579 0.109 0.006 0.015 0.015
## 61 V1 ~~ V6 0.468 0.003 0.003 0.012 0.012
## 113 V9 ~~ V24 0.404 -0.002 -0.002 -0.014 -0.014
## 118 V12 ~~ V24 0.343 -0.002 -0.002 -0.014 -0.014
## 96 V2 ~~ V12 0.341 -0.002 -0.002 -0.012 -0.012
## 43 neuroticism =~ V12 0.337 0.047 0.010 0.026 0.026
## 77 V5 ~~ V4 0.334 0.002 0.002 0.010 0.010
## 107 V7 ~~ V12 0.324 -0.002 -0.002 -0.014 -0.014
## 119 V18 ~~ V24 0.296 -0.002 -0.002 -0.014 -0.014
## 52 lying =~ V7 0.285 0.111 0.014 0.028 0.028
## 111 V9 ~~ V12 0.279 -0.002 -0.002 -0.011 -0.011
## 114 V6 ~~ V12 0.253 0.002 0.002 0.011 0.011
## 105 V7 ~~ V9 0.242 -0.004 -0.004 -0.027 -0.027
## 101 V4 ~~ V6 0.210 -0.002 -0.002 -0.008 -0.008
## 104 V4 ~~ V24 0.161 0.001 0.001 0.008 0.008
## 108 V7 ~~ V18 0.159 -0.001 -0.001 -0.010 -0.010
## 78 V5 ~~ V7 0.144 0.001 0.001 0.009 0.009
## 79 V5 ~~ V9 0.137 -0.001 -0.001 -0.008 -0.008
## 92 V2 ~~ V4 0.137 0.002 0.002 0.007 0.007
## 42 neuroticism =~ V6 0.128 -0.033 -0.007 -0.015 -0.015
## 32 extraversion =~ V7 0.021 0.031 0.002 0.003 0.003
## 81 V5 ~~ V12 0.017 0.000 0.000 -0.002 -0.002
## 103 V4 ~~ V18 0.009 0.000 0.000 -0.002 -0.002
## 40 neuroticism =~ V5 0.001 -0.002 0.000 -0.001 -0.001
## 72 V3 ~~ V12 0.000 0.000 0.000 0.000 0.000
# Respecify the EPI model: V3 now also loads on neuroticism, the
# cross-loading suggested by the largest modification index above.
epi.model1 <- 'extraversion =~ V1 + V3 + V5 + V8
neuroticism =~ V2 + V4 + V7 + V9
lying =~ V6 + V12 + V18 + V24
neuroticism =~ V3'

# Refit the CFA with the added cross-loading
epi.fit1 <- cfa(data = epi, model = epi.model1)

# Inspect the refitted model: global fit indices plus standardized loadings
summary(epi.fit1, fit.measures = TRUE, standardized = TRUE)
## lavaan 0.6-2 ended normally after 126 iterations
##
## Optimization method NLMINB
## Number of free parameters 28
##
## Used Total
## Number of observations 3193 3570
##
## Estimator ML
## Model Fit Test Statistic 332.891
## Degrees of freedom 50
## P-value (Chi-square) 0.000
##
## Model test baseline model:
##
## Minimum Function Test Statistic 2196.019
## Degrees of freedom 66
## P-value 0.000
##
## User model versus baseline model:
##
## Comparative Fit Index (CFI) 0.867
## Tucker-Lewis Index (TLI) 0.825
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -23082.232
## Loglikelihood unrestricted model (H1) -22915.787
##
## Number of free parameters 28
## Akaike (AIC) 46220.465
## Bayesian (BIC) 46390.389
## Sample-size adjusted Bayesian (BIC) 46301.421
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.042
## 90 Percent Confidence Interval 0.038 0.046
## P-value RMSEA <= 0.05 0.999
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.040
##
## Parameter Estimates:
##
## Information Expected
## Information saturated (h1) model Structured
## Standard Errors Standard
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## extraversion =~
## V1 1.000 0.068 0.152
## V3 1.798 0.325 5.532 0.000 0.123 0.246
## V5 -2.268 0.360 -6.291 0.000 -0.155 -0.414
## V8 5.077 0.887 5.725 0.000 0.346 0.732
## neuroticism =~
## V2 1.000 0.222 0.445
## V4 0.432 0.053 8.134 0.000 0.096 0.194
## V7 1.493 0.093 16.025 0.000 0.331 0.675
## V9 1.186 0.074 15.938 0.000 0.263 0.530
## lying =~
## V6 1.000 0.135 0.272
## V12 -0.851 0.127 -6.699 0.000 -0.115 -0.290
## V18 -0.799 0.119 -6.728 0.000 -0.108 -0.294
## V24 1.115 0.157 7.087 0.000 0.151 0.347
## neuroticism =~
## V3 -0.732 0.066 -11.074 0.000 -0.163 -0.327
##
## Covariances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## extraversion ~~
## neuroticism 0.004 0.001 4.953 0.000 0.283 0.283
## lying -0.003 0.001 -4.380 0.000 -0.346 -0.346
## neuroticism ~~
## lying -0.016 0.002 -7.337 0.000 -0.521 -0.521
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .V1 0.196 0.005 39.250 0.000 0.196 0.977
## .V3 0.217 0.006 34.642 0.000 0.217 0.878
## .V5 0.116 0.004 29.066 0.000 0.116 0.828
## .V8 0.104 0.014 7.603 0.000 0.104 0.465
## .V2 0.200 0.006 33.875 0.000 0.200 0.802
## .V4 0.235 0.006 39.046 0.000 0.235 0.962
## .V7 0.131 0.007 19.577 0.000 0.131 0.544
## .V9 0.178 0.006 29.830 0.000 0.178 0.720
## .V6 0.228 0.007 34.969 0.000 0.228 0.926
## .V12 0.144 0.004 34.186 0.000 0.144 0.916
## .V18 0.123 0.004 34.035 0.000 0.123 0.914
## .V24 0.166 0.005 31.188 0.000 0.166 0.879
## extraversion 0.005 0.001 3.265 0.001 1.000 1.000
## neuroticism 0.049 0.005 10.127 0.000 1.000 1.000
## lying 0.018 0.004 4.651 0.000 1.000 1.000
# Fit both the original and the respecified CFA, then compare them with a
# chi-square difference (likelihood-ratio) test; the models are nested,
# so anova() on two lavaan fits is appropriate here.
epi.fit  <- cfa(data = epi, model = epi.model)
epi.fit1 <- cfa(data = epi, model = epi.model1)
anova(epi.fit, epi.fit1)
## Chi Square Difference Test
##
## Df AIC BIC Chisq Chisq diff Df diff Pr(>Chisq)
## epi.fit1 50 46220 46390 332.89
## epi.fit 52 46418 46576 534.43 201.53 2 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Analyze the original model
epi.fit <- cfa(model = epi.model, data = epi)
#Find the fit indices for the original model
#(extract just the AIC and ECVI, which are used below to compare against
# the updated model; for both, lower values indicate better expected fit)
fitmeasures(epi.fit)[c("aic", "ecvi")]
## aic ecvi
## 4.641800e+04 1.836599e-01
#Analyze the updated model
epi.fit1 <- cfa(model = epi.model1, data = epi)
#Find the fit indices for the updated model
#(same AIC/ECVI pair as extracted for the original model, so the two
# fits can be compared directly)
fitmeasures(epi.fit1)[c("aic", "ecvi")]
## aic ecvi
## 46220.464546 0.121795
Chapter 3 - Troubleshooting Model Errors and Diagrams
Heywood Cases on the Latent Variable:
Heywood Cases on the Manifest Variable (negative error variances):
Create Diagrams with semPaths():
Example code includes:
#Read the simulated "bad latent variable" example data; the file's first
#column is an unnamed row index, which readr fills in as X1 (see the
#warning printed below) and which is dropped later.
badlatentdata <- readr::read_csv("./RInputFiles/badlatentdata.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_integer(),
## V1 = col_double(),
## V2 = col_double(),
## V3 = col_double(),
## V4 = col_double(),
## V5 = col_double(),
## V6 = col_double()
## )
#Read the simulated "bad (negative) error variance" example data; as with
#badlatentdata, the unnamed first column becomes X1 and is dropped later.
badvardata <- readr::read_csv("./RInputFiles/badvardata.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_integer(),
## V1 = col_double(),
## V2 = col_double(),
## V3 = col_double(),
## V4 = col_double(),
## V5 = col_double(),
## V6 = col_double()
## )
# Drop the row-index column and give the six survey items descriptive names
adoptsurvey <- badlatentdata %>%
  select(-X1) %>%
  rename(
    pictures   = V1,
    background = V2,
    loveskids  = V3,
    energy     = V4,
    wagstail   = V5,
    playful    = V6
  )

# Quick structural overview of the renamed data
str(adoptsurvey, give.attr = FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame': 100 obs. of 6 variables:
## $ pictures : num 3.66318 -0.00508 2.99697 -0.90249 4.54211 ...
## $ background: num 3.07 7.7 1.51 3.03 7.22 ...
## $ loveskids : num 10.31 3.06 6.61 1.54 3.38 ...
## $ energy : num 3.68 2.42 3.51 -3.04 12.93 ...
## $ wagstail : num 5.26 7.05 4.25 2.17 6.23 ...
## $ playful : num 8.275 11.727 0.675 2.457 13.43 ...
#Preview the first six rows of the renamed survey data
head(adoptsurvey)
## # A tibble: 6 x 6
## pictures background loveskids energy wagstail playful
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3.66 3.07 10.3 3.68 5.26 8.28
## 2 -0.00508 7.70 3.06 2.42 7.05 11.7
## 3 3.00 1.51 6.61 3.51 4.25 0.675
## 4 -0.902 3.03 1.54 -3.04 2.17 2.46
## 5 4.54 7.22 3.38 12.9 6.23 13.4
## 6 0.0257 -4.35 -1.95 -6.07 3.13 5.60
# Two-factor CFA: an "online profile" factor (goodstory) and an
# "in person" factor (inperson), three indicators each
adopt.model <- 'goodstory =~ pictures + background + loveskids
inperson =~ energy + wagstail + playful'

# Fit the model; lavaan warns (see below) that the latent covariance
# matrix is not positive definite -- a symptom of a Heywood case.
adopt.fit <- cfa(data = adoptsurvey, model = adopt.model)
## Warning in lav_object_post_check(object): lavaan WARNING: covariance matrix of latent variables
## is not positive definite;
## use lavInspect(fit, "cov.lv") to investigate.
#Inspect the model-implied latent covariance matrix, as the warning
#suggests: the goodstory-inperson covariance (4.780) is larger than the
#latent variances (0.397 and 4.505) permit, i.e. the implied correlation
#exceeds 1 -- a Heywood case on the latent variables.
lavInspect(adopt.fit, "cov.lv")
## gdstry inprsn
## goodstory 0.397
## inperson 4.780 4.505
#Full summary of the problem fit: note the standardized goodstory~~inperson
#covariance of 3.575 in the output below (an impossible correlation > 1)
summary(adopt.fit, standardized=TRUE, fit.measures=TRUE)
## lavaan 0.6-2 ended normally after 61 iterations
##
## Optimization method NLMINB
## Number of free parameters 13
##
## Number of observations 100
##
## Estimator ML
## Model Fit Test Statistic 15.674
## Degrees of freedom 8
## P-value (Chi-square) 0.047
##
## Model test baseline model:
##
## Minimum Function Test Statistic 74.694
## Degrees of freedom 15
## P-value 0.000
##
## User model versus baseline model:
##
## Comparative Fit Index (CFI) 0.871
## Tucker-Lewis Index (TLI) 0.759
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -1666.548
## Loglikelihood unrestricted model (H1) -1658.711
##
## Number of free parameters 13
## Akaike (AIC) 3359.096
## Bayesian (BIC) 3392.963
## Sample-size adjusted Bayesian (BIC) 3351.906
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.098
## 90 Percent Confidence Interval 0.010 0.170
## P-value RMSEA <= 0.05 0.126
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.080
##
## Parameter Estimates:
##
## Information Expected
## Information saturated (h1) model Structured
## Standard Errors Standard
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## goodstory =~
## pictures 1.000 0.630 0.200
## background 1.089 0.347 3.135 0.002 0.686 0.168
## loveskids 0.041 0.259 0.158 0.874 0.026 0.006
## inperson =~
## energy 1.000 2.122 0.538
## wagstail 1.134 0.300 3.780 0.000 2.406 0.473
## playful 0.601 0.213 2.823 0.005 1.275 0.329
##
## Covariances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## goodstory ~~
## inperson 4.780 1.248 3.830 0.000 3.575 3.575
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .pictures 9.570 1.749 5.471 0.000 9.570 0.960
## .background 16.198 2.641 6.133 0.000 16.198 0.972
## .loveskids 21.675 3.065 7.071 0.000 21.675 1.000
## .energy 11.031 1.920 5.745 0.000 11.031 0.710
## .wagstail 20.085 3.199 6.278 0.000 20.085 0.776
## .playful 13.382 1.955 6.845 0.000 13.382 0.892
## goodstory 0.397 1.176 0.338 0.736 1.000 1.000
## inperson 4.505 1.910 2.359 0.018 1.000 1.000
# Collapse the two factors into a single factor; with only one latent
# variable there is no latent covariance left to go out of bounds.
adopt.model <- 'goodstory =~ pictures + background + loveskids + energy + wagstail + playful'

# Refit, then scan the output for any remaining Heywood cases
adopt.fit <- cfa(data = adoptsurvey, model = adopt.model)
summary(adopt.fit, fit.measures = TRUE, standardized = TRUE)
## lavaan 0.6-2 ended normally after 49 iterations
##
## Optimization method NLMINB
## Number of free parameters 12
##
## Number of observations 100
##
## Estimator ML
## Model Fit Test Statistic 27.071
## Degrees of freedom 9
## P-value (Chi-square) 0.001
##
## Model test baseline model:
##
## Minimum Function Test Statistic 74.694
## Degrees of freedom 15
## P-value 0.000
##
## User model versus baseline model:
##
## Comparative Fit Index (CFI) 0.697
## Tucker-Lewis Index (TLI) 0.495
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -1672.246
## Loglikelihood unrestricted model (H1) -1658.711
##
## Number of free parameters 12
## Akaike (AIC) 3368.493
## Bayesian (BIC) 3399.755
## Sample-size adjusted Bayesian (BIC) 3361.856
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.142
## 90 Percent Confidence Interval 0.082 0.205
## P-value RMSEA <= 0.05 0.009
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.086
##
## Parameter Estimates:
##
## Information Expected
## Information saturated (h1) model Structured
## Standard Errors Standard
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## goodstory =~
## pictures 1.000 1.773 0.562
## background 0.892 0.337 2.650 0.008 1.581 0.387
## loveskids 0.547 0.344 1.587 0.112 0.969 0.208
## energy 1.194 0.372 3.214 0.001 2.118 0.537
## wagstail 1.712 0.517 3.310 0.001 3.035 0.597
## playful 0.773 0.312 2.480 0.013 1.371 0.354
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .pictures 6.824 1.323 5.160 0.000 6.824 0.685
## .background 14.168 2.228 6.359 0.000 14.168 0.850
## .loveskids 20.736 3.009 6.891 0.000 20.736 0.957
## .energy 11.051 2.049 5.394 0.000 11.051 0.711
## .wagstail 16.661 3.486 4.779 0.000 16.661 0.644
## .playful 13.128 2.021 6.496 0.000 13.128 0.875
## goodstory 3.143 1.369 2.296 0.022 1.000 1.000
# Rebuild adoptsurvey from the "bad error variance" data set: drop the
# row index and apply the same descriptive item names as before
adoptsurvey <- badvardata %>%
  select(-X1) %>%
  rename(
    pictures   = V1,
    background = V2,
    loveskids  = V3,
    energy     = V4,
    wagstail   = V5,
    playful    = V6
  )

# Structural overview of the renamed data
str(adoptsurvey, give.attr = FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame': 100 obs. of 6 variables:
## $ pictures : num 3.71 1.24 1.19 -1.26 4.58 ...
## $ background: num -0.964 6.38 -4.329 5.196 -0.145 ...
## $ loveskids : num 3.86 5.95 8.23 2.46 9.53 ...
## $ energy : num -6.73 1.61 4.09 7.6 -3.13 ...
## $ wagstail : num -1.199 0.532 4.59 3.699 2.546 ...
## $ playful : num 4.1 1.93 4.04 4.56 3.43 ...
#Numeric five-number summaries plus means for every survey item
summary(adoptsurvey)
## pictures background loveskids energy
## Min. :-4.5482 Min. :-4.83064 Min. :-7.162 Min. :-6.729
## 1st Qu.:-0.5815 1st Qu.:-0.03834 1st Qu.: 1.751 1st Qu.: 0.386
## Median : 1.7705 Median : 3.33339 Median : 4.585 Median : 2.081
## Mean : 1.7912 Mean : 3.04096 Mean : 5.178 Mean : 2.362
## 3rd Qu.: 3.6228 3rd Qu.: 5.57599 3rd Qu.: 8.709 3rd Qu.: 4.906
## Max. : 9.4674 Max. :14.81218 Max. :18.237 Max. :12.091
## wagstail playful
## Min. :-9.945 Min. :-4.913
## 1st Qu.:-1.226 1st Qu.: 1.811
## Median : 2.160 Median : 3.916
## Mean : 2.346 Mean : 3.711
## 3rd Qu.: 5.242 3rd Qu.: 5.751
## Max. :19.811 Max. :11.446
# Same two-factor structure as before, now fit to the "bad error
# variance" data set
adopt.model <- 'goodstory =~ pictures + background + loveskids
inperson =~ energy + wagstail + playful'

# Fit the model; lavaan warns (see below) that some estimated observed
# variable variances are negative
adopt.fit <- cfa(data = adoptsurvey, model = adopt.model)
## Warning in lav_object_post_check(object): lavaan WARNING: some estimated ov
## variances are negative
#Summarize the model to view the negative variances
#(in the output below, the .wagstail residual variance is -65.677, an
# impossible value; its R-square is consequently reported as NA)
summary(adopt.fit, standardized=TRUE, fit.measures=TRUE, rsquare=TRUE)
## lavaan 0.6-2 ended normally after 303 iterations
##
## Optimization method NLMINB
## Number of free parameters 13
##
## Number of observations 100
##
## Estimator ML
## Model Fit Test Statistic 7.134
## Degrees of freedom 8
## P-value (Chi-square) 0.522
##
## Model test baseline model:
##
## Minimum Function Test Statistic 25.380
## Degrees of freedom 15
## P-value 0.045
##
## User model versus baseline model:
##
## Comparative Fit Index (CFI) 1.000
## Tucker-Lewis Index (TLI) 1.156
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -1649.956
## Loglikelihood unrestricted model (H1) -1646.389
##
## Number of free parameters 13
## Akaike (AIC) 3325.912
## Bayesian (BIC) 3359.779
## Sample-size adjusted Bayesian (BIC) 3318.722
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.000
## 90 Percent Confidence Interval 0.000 0.109
## P-value RMSEA <= 0.05 0.686
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.050
##
## Parameter Estimates:
##
## Information Expected
## Information saturated (h1) model Structured
## Standard Errors Standard
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## goodstory =~
## pictures 1.000 1.360 0.437
## background 1.471 0.763 1.928 0.054 2.000 0.521
## loveskids 1.746 0.892 1.958 0.050 2.375 0.501
## inperson =~
## energy 1.000 0.208 0.058
## wagstail 45.262 1090.143 0.042 0.967 9.409 1.969
## playful 0.869 1.110 0.783 0.434 0.181 0.054
##
## Covariances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## goodstory ~~
## inperson -0.014 0.332 -0.041 0.967 -0.048 -0.048
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .pictures 7.814 1.514 5.162 0.000 7.814 0.809
## .background 10.762 2.695 3.993 0.000 10.762 0.729
## .loveskids 16.791 3.936 4.266 0.000 16.791 0.749
## .energy 12.642 2.066 6.119 0.000 12.642 0.997
## .wagstail -65.677 2124.215 -0.031 0.975 -65.677 -2.875
## .playful 11.148 1.760 6.335 0.000 11.148 0.997
## goodstory 1.850 1.310 1.411 0.158 1.000 1.000
## inperson 0.043 1.046 0.041 0.967 1.000 1.000
##
## R-Square:
## Estimate
## pictures 0.191
## background 0.271
## loveskids 0.251
## energy 0.003
## wagstail NA
## playful 0.003
#View the variance of the problem manifest variable
#(wagstail is the indicator whose estimated error variance went negative
# above; its observed sample variance is used next to constrain the model)
var(adoptsurvey$wagstail)
## [1] 23.07446
# Fix the residual variance of the problem indicator to its observed
# sample variance instead of letting the optimizer drive it negative.
# Computing the value here -- rather than hard-coding 23.07446 -- keeps
# the constraint in sync with the data at full precision.
wagstail_var <- var(adoptsurvey$wagstail)
adopt.model2 <- paste0('goodstory =~ pictures + background + loveskids
inperson =~ energy + wagstail + playful
wagstail~~', wagstail_var, '*wagstail')
# Refit with the constrained residual variance and summarize
# (rsquare = TRUE adds the proportion of each indicator's variance
#  explained by its factor)
adopt.fit2 <- cfa(model = adopt.model2, data = adoptsurvey)
summary(adopt.fit2, standardized=TRUE, fit.measures=TRUE, rsquare=TRUE)
## lavaan 0.6-2 ended normally after 69 iterations
##
## Optimization method NLMINB
## Number of free parameters 12
##
## Number of observations 100
##
## Estimator ML
## Model Fit Test Statistic 8.493
## Degrees of freedom 9
## P-value (Chi-square) 0.485
##
## Model test baseline model:
##
## Minimum Function Test Statistic 25.380
## Degrees of freedom 15
## P-value 0.045
##
## User model versus baseline model:
##
## Comparative Fit Index (CFI) 1.000
## Tucker-Lewis Index (TLI) 1.081
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -1650.635
## Loglikelihood unrestricted model (H1) -1646.389
##
## Number of free parameters 12
## Akaike (AIC) 3325.270
## Bayesian (BIC) 3356.532
## Sample-size adjusted Bayesian (BIC) 3318.633
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.000
## 90 Percent Confidence Interval 0.000 0.108
## P-value RMSEA <= 0.05 0.664
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.058
##
## Parameter Estimates:
##
## Information Expected
## Information saturated (h1) model Structured
## Standard Errors Standard
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## goodstory =~
## pictures 1.000 1.344 0.432
## background 1.461 0.758 1.928 0.054 1.964 0.511
## loveskids 1.818 0.947 1.919 0.055 2.444 0.516
## inperson =~
## energy 1.000 0.959 0.269
## wagstail 1.391 2.244 0.620 0.535 1.334 0.268
## playful 0.807 1.640 0.492 0.623 0.774 0.231
##
## Covariances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## goodstory ~~
## inperson -0.077 0.450 -0.172 0.863 -0.060 -0.060
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .wagstail 23.074 23.074 0.928
## .pictures 7.857 1.510 5.203 0.000 7.857 0.813
## .background 10.906 2.672 4.082 0.000 10.906 0.739
## .loveskids 16.461 4.103 4.012 0.000 16.461 0.734
## .energy 11.765 2.683 4.385 0.000 11.765 0.928
## .playful 10.582 2.082 5.084 0.000 10.582 0.946
## goodstory 1.807 1.296 1.395 0.163 1.000 1.000
## inperson 0.920 2.209 0.416 0.677 1.000 1.000
##
## R-Square:
## Estimate
## wagstail 0.072
## pictures 0.187
## background 0.261
## loveskids 0.266
## energy 0.072
## playful 0.054
# Default path diagram of the fitted model
semPlot::semPaths(adopt.fit)

# Same diagram laid out as a rotated tree
semPlot::semPaths(object = adopt.fit, layout = "tree", rotation = 2)

# Annotated version: edges drawn in blue and labelled with the
# standardized estimates
semPlot::semPaths(
  object = adopt.fit,
  layout = "tree",
  rotation = 2,
  what = "std",
  whatLabels = "std",
  edge.label.cex = 1,
  edge.color = "blue"
)
Chapter 4 - Full Example and Extension
Model WAIS-III IQ Scale:
Update WAIS-III Model:
Hierarchical Model of IQ:
Wrap Up:
Example code includes:
#Read the WAIS-III subtest scores; the file's unnamed first column becomes
#X1 (see the warning below) and is removed a few lines down.
IQdata <- readr::read_csv("./RInputFiles/IQdata.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_integer(),
## inform = col_integer(),
## simil = col_integer(),
## vocab = col_integer(),
## compreh = col_integer(),
## digspan = col_integer(),
## arith = col_integer(),
## piccomp = col_integer(),
## block = col_integer(),
## matrixreason = col_integer(),
## symbolsearch = col_integer(),
## digsym = col_integer(),
## lnseq = col_integer()
## )
#Column-wise preview of the raw data (300 observations, 13 variables
#including the X1 row index)
glimpse(IQdata)
## Observations: 300
## Variables: 13
## $ X1 <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15...
## $ inform <int> 31, 15, 13, 13, 22, 25, 20, 18, 21, 22, 16, 23, 1...
## $ simil <int> 23, 20, 22, 21, 21, 22, 25, 25, 22, 25, 25, 24, 1...
## $ vocab <int> 63, 44, 40, 51, 55, 61, 45, 61, 57, 56, 62, 53, 4...
## $ compreh <int> 27, 21, 28, 21, 28, 27, 23, 28, 27, 22, 28, 30, 2...
## $ digspan <int> 20, 13, 14, 22, 17, 20, 13, 22, 14, 15, 15, 26, 1...
## $ arith <int> 18, 12, 13, 13, 10, 20, 16, 14, 16, 10, 13, 21, 1...
## $ piccomp <int> 18, 13, 13, 16, 13, 18, 16, 22, 16, 16, 20, 19, 1...
## $ block <int> 50, 29, 28, 36, 22, 59, 33, 43, 40, 31, 35, 59, 4...
## $ matrixreason <int> 21, 17, 16, 14, 13, 18, 14, 18, 13, 13, 21, 16, 1...
## $ symbolsearch <int> 38, 24, 25, 27, 27, 38, 31, 42, 34, 29, 37, 40, 3...
## $ digsym <int> 57, 56, 72, 67, 60, 78, 60, 45, 40, 57, 63, 87, 7...
## $ lnseq <int> 15, 12, 13, 18, 15, 16, 12, 30, 19, 16, 23, 16, 1...
# Drop the row-index column added on import, then re-check the columns
IQdata <- select(IQdata, -X1)
glimpse(IQdata)
## Observations: 300
## Variables: 12
## $ inform <int> 31, 15, 13, 13, 22, 25, 20, 18, 21, 22, 16, 23, 1...
## $ simil <int> 23, 20, 22, 21, 21, 22, 25, 25, 22, 25, 25, 24, 1...
## $ vocab <int> 63, 44, 40, 51, 55, 61, 45, 61, 57, 56, 62, 53, 4...
## $ compreh <int> 27, 21, 28, 21, 28, 27, 23, 28, 27, 22, 28, 30, 2...
## $ digspan <int> 20, 13, 14, 22, 17, 20, 13, 22, 14, 15, 15, 26, 1...
## $ arith <int> 18, 12, 13, 13, 10, 20, 16, 14, 16, 10, 13, 21, 1...
## $ piccomp <int> 18, 13, 13, 16, 13, 18, 16, 22, 16, 16, 20, 19, 1...
## $ block <int> 50, 29, 28, 36, 22, 59, 33, 43, 40, 31, 35, 59, 4...
## $ matrixreason <int> 21, 17, 16, 14, 13, 18, 14, 18, 13, 13, 21, 16, 1...
## $ symbolsearch <int> 38, 24, 25, 27, 27, 38, 31, 42, 34, 29, 37, 40, 3...
## $ digsym <int> 57, 56, 72, 67, 60, 78, 60, 45, 40, 57, 63, 87, 7...
## $ lnseq <int> 15, 12, 13, 18, 15, 16, 12, 30, 19, 16, 23, 16, 1...
# Four-factor WAIS-III model: verbal comprehension, working memory,
# perceptual organization, and processing speed
wais.model <- 'verbalcomp =~ vocab + simil + inform + compreh
workingmemory =~ arith + digspan + lnseq
perceptorg =~ piccomp + block + matrixreason
processing =~ digsym + symbolsearch'

# Fit the model; lavaan warns (see below) that the latent covariance
# matrix is not positive definite
wais.fit <- cfa(data = IQdata, model = wais.model)
## Warning in lav_object_post_check(object): lavaan WARNING: covariance matrix of latent variables
## is not positive definite;
## use lavInspect(fit, "cov.lv") to investigate.
#Summarize the model
#(in the output below, the standardized perceptorg~~processing covariance
# is 1.031 -- an implied latent correlation above 1, i.e. a Heywood case,
# which motivates merging those two factors next)
summary(wais.fit, standardized=TRUE, fit.measures=TRUE, rsquare=TRUE)
## lavaan 0.6-2 ended normally after 153 iterations
##
## Optimization method NLMINB
## Number of free parameters 30
##
## Number of observations 300
##
## Estimator ML
## Model Fit Test Statistic 233.268
## Degrees of freedom 48
## P-value (Chi-square) 0.000
##
## Model test baseline model:
##
## Minimum Function Test Statistic 1042.916
## Degrees of freedom 66
## P-value 0.000
##
## User model versus baseline model:
##
## Comparative Fit Index (CFI) 0.810
## Tucker-Lewis Index (TLI) 0.739
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -9939.800
## Loglikelihood unrestricted model (H1) -9823.166
##
## Number of free parameters 30
## Akaike (AIC) 19939.599
## Bayesian (BIC) 20050.713
## Sample-size adjusted Bayesian (BIC) 19955.570
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.113
## 90 Percent Confidence Interval 0.099 0.128
## P-value RMSEA <= 0.05 0.000
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.073
##
## Parameter Estimates:
##
## Information Expected
## Information saturated (h1) model Structured
## Standard Errors Standard
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## verbalcomp =~
## vocab 1.000 6.282 0.879
## simil 0.296 0.031 9.470 0.000 1.859 0.581
## inform 0.450 0.043 10.483 0.000 2.825 0.645
## compreh 0.315 0.035 8.986 0.000 1.979 0.551
## workingmemory =~
## arith 1.000 2.530 0.845
## digspan 0.875 0.137 6.373 0.000 2.213 0.561
## lnseq 0.225 0.106 2.130 0.033 0.570 0.142
## perceptorg =~
## piccomp 1.000 1.391 0.596
## block 3.988 0.421 9.477 0.000 5.546 0.719
## matrixreason 0.909 0.127 7.171 0.000 1.264 0.494
## processing =~
## digsym 1.000 2.809 0.239
## symbolsearch 1.065 0.300 3.547 0.000 2.990 0.724
##
## Covariances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## verbalcomp ~~
## workingmemory 6.120 1.232 4.969 0.000 0.385 0.385
## perceptorg 5.644 0.868 6.503 0.000 0.646 0.646
## processing 10.050 3.150 3.190 0.001 0.570 0.570
## workingmemory ~~
## perceptorg 2.437 0.371 6.561 0.000 0.693 0.693
## processing 2.701 0.984 2.745 0.006 0.380 0.380
## perceptorg ~~
## processing 4.027 1.200 3.356 0.001 1.031 1.031
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .vocab 11.573 2.656 4.357 0.000 11.573 0.227
## .simil 6.792 0.620 10.951 0.000 6.792 0.663
## .inform 11.201 1.084 10.330 0.000 11.201 0.584
## .compreh 8.969 0.804 11.157 0.000 8.969 0.696
## .arith 2.560 0.901 2.842 0.004 2.560 0.286
## .digspan 10.653 1.102 9.666 0.000 10.653 0.685
## .lnseq 15.750 1.294 12.173 0.000 15.750 0.980
## .piccomp 3.505 0.323 10.851 0.000 3.505 0.644
## .block 28.761 3.207 8.968 0.000 28.761 0.483
## .matrixreason 4.957 0.431 11.509 0.000 4.957 0.756
## .digsym 130.314 10.847 12.014 0.000 130.314 0.943
## .symbolsearch 8.127 2.480 3.277 0.001 8.127 0.476
## verbalcomp 39.459 4.757 8.294 0.000 1.000 1.000
## workingmemory 6.399 1.122 5.703 0.000 1.000 1.000
## perceptorg 1.934 0.371 5.211 0.000 1.000 1.000
## processing 7.889 4.309 1.831 0.067 1.000 1.000
##
## R-Square:
## Estimate
## vocab 0.773
## simil 0.337
## inform 0.416
## compreh 0.304
## arith 0.714
## digspan 0.315
## lnseq 0.020
## piccomp 0.356
## block 0.517
## matrixreason 0.244
## digsym 0.057
## symbolsearch 0.524
# Merge the processing-speed indicators into the perceptual-organization
# factor, eliminating the out-of-bounds latent correlation between them
wais.model <- 'verbalcomp =~ vocab + simil + inform + compreh
workingmemory =~ arith + digspan + lnseq
perceptorg =~ piccomp + block + matrixreason + digsym + symbolsearch'

# Refit and summarize the three-factor model
wais.fit <- cfa(data = IQdata, model = wais.model)
summary(wais.fit, fit.measures = TRUE, standardized = TRUE, rsquare = TRUE)
## lavaan 0.6-2 ended normally after 110 iterations
##
## Optimization method NLMINB
## Number of free parameters 27
##
## Number of observations 300
##
## Estimator ML
## Model Fit Test Statistic 252.809
## Degrees of freedom 51
## P-value (Chi-square) 0.000
##
## Model test baseline model:
##
## Minimum Function Test Statistic 1042.916
## Degrees of freedom 66
## P-value 0.000
##
## User model versus baseline model:
##
## Comparative Fit Index (CFI) 0.793
## Tucker-Lewis Index (TLI) 0.733
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -9949.570
## Loglikelihood unrestricted model (H1) -9823.166
##
## Number of free parameters 27
## Akaike (AIC) 19953.141
## Bayesian (BIC) 20053.143
## Sample-size adjusted Bayesian (BIC) 19967.515
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.115
## 90 Percent Confidence Interval 0.101 0.129
## P-value RMSEA <= 0.05 0.000
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.076
##
## Parameter Estimates:
##
## Information Expected
## Information saturated (h1) model Structured
## Standard Errors Standard
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## verbalcomp =~
## vocab 1.000 6.281 0.879
## simil 0.296 0.031 9.483 0.000 1.861 0.581
## inform 0.449 0.043 10.481 0.000 2.822 0.644
## compreh 0.315 0.035 8.999 0.000 1.981 0.552
## workingmemory =~
## arith 1.000 2.528 0.844
## digspan 0.881 0.152 5.786 0.000 2.227 0.565
## lnseq 0.205 0.107 1.920 0.055 0.518 0.129
## perceptorg =~
## piccomp 1.000 1.517 0.650
## block 3.739 0.390 9.583 0.000 5.672 0.735
## matrixreason 0.832 0.117 7.099 0.000 1.262 0.493
## digsym 1.603 0.507 3.160 0.002 2.431 0.207
## symbolsearch 1.880 0.204 9.236 0.000 2.852 0.690
##
## Covariances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## verbalcomp ~~
## workingmemory 6.132 1.234 4.970 0.000 0.386 0.386
## perceptorg 5.892 0.886 6.647 0.000 0.618 0.618
## workingmemory ~~
## perceptorg 2.227 0.362 6.149 0.000 0.581 0.581
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .vocab 11.577 2.651 4.367 0.000 11.577 0.227
## .simil 6.787 0.620 10.950 0.000 6.787 0.662
## .inform 11.218 1.085 10.342 0.000 11.218 0.585
## .compreh 8.962 0.803 11.155 0.000 8.962 0.696
## .arith 2.571 1.014 2.535 0.011 2.571 0.287
## .digspan 10.590 1.161 9.121 0.000 10.590 0.681
## .lnseq 15.807 1.297 12.183 0.000 15.807 0.983
## .piccomp 3.138 0.317 9.913 0.000 3.138 0.577
## .block 27.343 3.226 8.476 0.000 27.343 0.459
## .matrixreason 4.960 0.441 11.243 0.000 4.960 0.757
## .digsym 132.291 10.925 12.109 0.000 132.291 0.957
## .symbolsearch 8.936 0.957 9.333 0.000 8.936 0.524
## verbalcomp 39.455 4.754 8.299 0.000 1.000 1.000
## workingmemory 6.388 1.215 5.259 0.000 1.000 1.000
## perceptorg 2.301 0.408 5.646 0.000 1.000 1.000
##
## R-Square:
## Estimate
## vocab 0.773
## simil 0.338
## inform 0.415
## compreh 0.304
## arith 0.713
## digspan 0.319
## lnseq 0.017
## piccomp 0.423
## block 0.541
## matrixreason 0.243
## digsym 0.043
## symbolsearch 0.476
# Path diagram of the three-factor fit: tree layout, black edges
# labelled with standardized estimates
semPlot::semPaths(
  object = wais.fit,
  layout = "tree",
  rotation = 1,
  what = "std",
  whatLabels = "std",
  edge.label.cex = 1,
  edge.color = "black"
)

# Modification indices for the refit, largest first, to see whether any
# further respecification is suggested
modificationindices(wais.fit, sort = TRUE)
## lhs op rhs mi epc sepc.lv sepc.all sepc.nox
## 66 simil ~~ inform 35.879 -3.757 -3.757 -0.431 -0.431
## 56 vocab ~~ inform 28.377 9.783 9.783 0.858 0.858
## 48 perceptorg =~ vocab 21.865 -2.077 -3.151 -0.441 -0.441
## 115 block ~~ matrixreason 16.209 -3.622 -3.622 -0.311 -0.311
## 96 arith ~~ block 15.061 3.679 3.679 0.439 0.439
## 117 block ~~ symbolsearch 13.144 5.725 5.725 0.366 0.366
## 47 workingmemory =~ symbolsearch 12.272 -0.467 -1.181 -0.286 -0.286
## 81 inform ~~ block 12.269 4.358 4.358 0.249 0.249
## 64 vocab ~~ digsym 11.578 -11.261 -11.261 -0.288 -0.288
## 40 workingmemory =~ simil 11.383 0.278 0.703 0.220 0.220
## 72 simil ~~ block 10.605 -3.084 -3.084 -0.226 -0.226
## 45 workingmemory =~ matrixreason 9.685 0.267 0.675 0.264 0.264
## 95 arith ~~ piccomp 9.463 -0.892 -0.892 -0.314 -0.314
## 60 vocab ~~ lnseq 9.425 -3.486 -3.486 -0.258 -0.258
## 67 simil ~~ compreh 9.356 1.587 1.587 0.203 0.203
## 44 workingmemory =~ block 9.258 0.765 1.933 0.251 0.251
## 51 perceptorg =~ compreh 9.177 0.601 0.912 0.254 0.254
## 62 vocab ~~ block 8.712 -5.377 -5.377 -0.302 -0.302
## 73 simil ~~ matrixreason 8.672 1.065 1.065 0.184 0.184
## 106 lnseq ~~ piccomp 8.620 1.298 1.298 0.184 0.184
## 91 compreh ~~ digsym 8.155 5.908 5.908 0.172 0.172
## 59 vocab ~~ digspan 8.127 2.849 2.849 0.257 0.257
## 37 verbalcomp =~ digsym 7.803 -0.464 -2.917 -0.248 -0.248
## 68 simil ~~ arith 7.534 1.064 1.064 0.255 0.255
## 99 arith ~~ symbolsearch 7.468 -1.391 -1.391 -0.290 -0.290
## 57 vocab ~~ compreh 7.107 -3.508 -3.508 -0.344 -0.344
## 87 compreh ~~ lnseq 7.001 1.887 1.887 0.159 0.159
## 97 arith ~~ matrixreason 6.391 0.848 0.848 0.237 0.237
## 107 lnseq ~~ block 5.677 3.289 3.289 0.158 0.158
## 34 verbalcomp =~ piccomp 5.507 0.071 0.447 0.192 0.192
## 78 inform ~~ digspan 5.435 -1.649 -1.649 -0.151 -0.151
## 33 verbalcomp =~ lnseq 5.250 -0.104 -0.652 -0.163 -0.163
## 54 perceptorg =~ lnseq 4.644 0.512 0.777 0.194 0.194
## 39 workingmemory =~ vocab 4.638 -0.406 -1.025 -0.143 -0.143
## 102 digspan ~~ block 4.564 -2.689 -2.689 -0.158 -0.158
## 35 verbalcomp =~ block 4.551 -0.218 -1.371 -0.178 -0.178
## 88 compreh ~~ piccomp 4.455 0.728 0.728 0.137 0.137
## 112 piccomp ~~ matrixreason 4.306 0.568 0.568 0.144 0.144
## 101 digspan ~~ piccomp 4.218 0.808 0.808 0.140 0.140
## 46 workingmemory =~ digsym 4.139 -0.852 -2.152 -0.183 -0.183
## 71 simil ~~ piccomp 4.029 0.607 0.607 0.132 0.132
## 76 inform ~~ compreh 3.789 -1.367 -1.367 -0.136 -0.136
## 70 simil ~~ lnseq 3.693 -1.200 -1.200 -0.116 -0.116
## 50 perceptorg =~ inform 3.487 0.444 0.673 0.154 0.154
## 58 vocab ~~ arith 3.451 -1.457 -1.457 -0.267 -0.267
## 55 vocab ~~ simil 3.393 2.239 2.239 0.253 0.253
## 113 piccomp ~~ digsym 3.375 2.419 2.419 0.119 0.119
## 93 arith ~~ digspan 3.274 7.960 7.960 1.526 1.526
## 86 compreh ~~ digspan 3.234 -1.110 -1.110 -0.114 -0.114
## 80 inform ~~ piccomp 2.871 -0.672 -0.672 -0.113 -0.113
## 104 digspan ~~ digsym 2.754 -3.822 -3.822 -0.102 -0.102
## 114 piccomp ~~ symbolsearch 2.677 -0.731 -0.731 -0.138 -0.138
## 89 compreh ~~ block 2.551 1.725 1.725 0.110 0.110
## 90 compreh ~~ matrixreason 2.342 -0.632 -0.632 -0.095 -0.095
## 74 simil ~~ digsym 2.021 -2.575 -2.575 -0.086 -0.086
## 43 workingmemory =~ piccomp 1.899 -0.104 -0.262 -0.113 -0.113
## 49 perceptorg =~ simil 1.675 0.227 0.345 0.108 0.108
## 92 compreh ~~ symbolsearch 1.646 0.764 0.764 0.085 0.085
## 111 piccomp ~~ block 1.591 -1.084 -1.084 -0.117 -0.117
## 85 compreh ~~ arith 1.350 -0.514 -0.514 -0.107 -0.107
## 32 verbalcomp =~ digspan 1.224 0.058 0.365 0.092 0.092
## 79 inform ~~ lnseq 0.998 -0.815 -0.815 -0.061 -0.061
## 69 simil ~~ digspan 0.996 0.540 0.540 0.064 0.064
## 53 perceptorg =~ digspan 0.942 -0.710 -1.077 -0.273 -0.273
## 77 inform ~~ arith 0.890 0.480 0.480 0.089 0.089
## 116 block ~~ digsym 0.805 3.770 3.770 0.063 0.063
## 120 digsym ~~ symbolsearch 0.724 1.948 1.948 0.057 0.057
## 100 digspan ~~ lnseq 0.703 -0.688 -0.688 -0.053 -0.053
## 83 inform ~~ digsym 0.667 1.935 1.935 0.050 0.050
## 36 verbalcomp =~ matrixreason 0.543 0.025 0.159 0.062 0.062
## 61 vocab ~~ piccomp 0.529 0.414 0.414 0.069 0.069
## 105 digspan ~~ symbolsearch 0.481 -0.475 -0.475 -0.049 -0.049
## 52 perceptorg =~ arith 0.478 -0.694 -1.052 -0.352 -0.352
## 98 arith ~~ digsym 0.474 -1.135 -1.135 -0.062 -0.062
## 94 arith ~~ lnseq 0.430 -0.496 -0.496 -0.078 -0.078
## 31 verbalcomp =~ arith 0.237 -0.029 -0.182 -0.061 -0.061
## 103 digspan ~~ matrixreason 0.226 0.221 0.221 0.030 0.030
## 42 workingmemory =~ compreh 0.190 -0.041 -0.103 -0.029 -0.029
## 75 simil ~~ symbolsearch 0.188 -0.227 -0.227 -0.029 -0.029
## 63 vocab ~~ matrixreason 0.143 -0.253 -0.253 -0.033 -0.033
## 109 lnseq ~~ digsym 0.128 -0.951 -0.951 -0.021 -0.021
## 38 verbalcomp =~ symbolsearch 0.077 0.015 0.094 0.023 0.023
## 118 matrixreason ~~ digsym 0.060 -0.380 -0.380 -0.015 -0.015
## 41 workingmemory =~ inform 0.037 0.021 0.053 0.012 0.012
## 119 matrixreason ~~ symbolsearch 0.031 -0.085 -0.085 -0.013 -0.013
## 108 lnseq ~~ matrixreason 0.017 0.069 0.069 0.008 0.008
## 110 lnseq ~~ symbolsearch 0.009 0.072 0.072 0.006 0.006
## 65 vocab ~~ symbolsearch 0.005 -0.068 -0.068 -0.007 -0.007
## 84 inform ~~ symbolsearch 0.004 -0.045 -0.045 -0.004 -0.004
## 82 inform ~~ matrixreason 0.004 0.029 0.029 0.004 0.004
# Update the three-factor model: same three latent factors, plus a residual
# covariance between simil and inform (suggested by the largest modification
# index, mi = 35.879, in the previous fit).
wais.model2 <- 'verbalcomp =~ vocab + simil + inform + compreh
workingmemory =~ arith + digspan + lnseq
perceptorg =~ piccomp + block + matrixreason + digsym + symbolsearch
simil ~~ inform'
# Fit the updated three-factor model with lavaan's cfa() on IQdata
wais.fit2 <- cfa(model=wais.model2, data=IQdata)
# Summarize with standardized loadings, global fit indices, and R-squared
summary(wais.fit2, standardized=TRUE, fit.measures=TRUE, rsquare=TRUE)
## lavaan 0.6-2 ended normally after 114 iterations
##
## Optimization method NLMINB
## Number of free parameters 28
##
## Number of observations 300
##
## Estimator ML
## Model Fit Test Statistic 212.813
## Degrees of freedom 50
## P-value (Chi-square) 0.000
##
## Model test baseline model:
##
## Minimum Function Test Statistic 1042.916
## Degrees of freedom 66
## P-value 0.000
##
## User model versus baseline model:
##
## Comparative Fit Index (CFI) 0.833
## Tucker-Lewis Index (TLI) 0.780
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -9929.572
## Loglikelihood unrestricted model (H1) -9823.166
##
## Number of free parameters 28
## Akaike (AIC) 19915.144
## Bayesian (BIC) 20018.850
## Sample-size adjusted Bayesian (BIC) 19930.051
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.104
## 90 Percent Confidence Interval 0.090 0.119
## P-value RMSEA <= 0.05 0.000
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.071
##
## Parameter Estimates:
##
## Information Expected
## Information saturated (h1) model Structured
## Standard Errors Standard
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## verbalcomp =~
## vocab 1.000 5.888 0.824
## simil 0.361 0.035 10.184 0.000 2.125 0.664
## inform 0.525 0.048 10.857 0.000 3.090 0.706
## compreh 0.334 0.036 9.349 0.000 1.965 0.547
## workingmemory =~
## arith 1.000 2.565 0.857
## digspan 0.857 0.149 5.768 0.000 2.199 0.558
## lnseq 0.193 0.104 1.850 0.064 0.495 0.123
## perceptorg =~
## piccomp 1.000 1.515 0.650
## block 3.737 0.390 9.581 0.000 5.662 0.734
## matrixreason 0.843 0.118 7.176 0.000 1.278 0.499
## digsym 1.615 0.508 3.181 0.001 2.446 0.208
## symbolsearch 1.875 0.203 9.218 0.000 2.841 0.688
##
## Covariances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .simil ~~
## .inform -3.738 0.606 -6.169 0.000 -3.738 -0.503
## verbalcomp ~~
## workingmemory 6.278 1.181 5.315 0.000 0.416 0.416
## perceptorg 5.654 0.859 6.583 0.000 0.634 0.634
## workingmemory ~~
## perceptorg 2.237 0.363 6.172 0.000 0.576 0.576
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .vocab 16.365 2.375 6.892 0.000 16.365 0.321
## .simil 5.734 0.610 9.399 0.000 5.734 0.560
## .inform 9.635 1.095 8.801 0.000 9.635 0.502
## .compreh 9.026 0.791 11.413 0.000 9.026 0.700
## .arith 2.380 1.037 2.294 0.022 2.380 0.266
## .digspan 10.715 1.154 9.282 0.000 10.715 0.689
## .lnseq 15.830 1.298 12.193 0.000 15.830 0.985
## .piccomp 3.143 0.316 9.937 0.000 3.143 0.578
## .block 27.457 3.220 8.527 0.000 27.457 0.461
## .matrixreason 4.921 0.439 11.216 0.000 4.921 0.751
## .digsym 132.218 10.920 12.108 0.000 132.218 0.957
## .symbolsearch 8.996 0.958 9.393 0.000 8.996 0.527
## verbalcomp 34.667 4.408 7.865 0.000 1.000 1.000
## workingmemory 6.579 1.239 5.309 0.000 1.000 1.000
## perceptorg 2.296 0.407 5.643 0.000 1.000 1.000
##
## R-Square:
## Estimate
## vocab 0.679
## simil 0.440
## inform 0.498
## compreh 0.300
## arith 0.734
## digspan 0.311
## lnseq 0.015
## piccomp 0.422
## block 0.539
## matrixreason 0.249
## digsym 0.043
## symbolsearch 0.473
# Compare the nested models with a chi-square difference test; the added
# residual covariance significantly improves fit (p = 2.545e-10 below).
anova(wais.fit, wais.fit2)
## Chi Square Difference Test
##
## Df AIC BIC Chisq Chisq diff Df diff Pr(>Chisq)
## wais.fit2 50 19915 20019 212.81
## wais.fit 51 19953 20053 252.81 39.996 1 2.545e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# View AIC and ECVI for the original model (lower is better for both)
fitmeasures(wais.fit, c("aic", "ecvi"))
## aic ecvi
## 19953.141 1.023
# View the same fit indices for the updated model
fitmeasures(wais.fit2, c("aic", "ecvi"))
## aic ecvi
## 19915.144 0.896
# Update the three-factor model to a hierarchical model: a second-order
# "general" factor now loads on the three first-order factors.
wais.model3 <- 'verbalcomp =~ vocab + simil + inform + compreh
workingmemory =~ arith + digspan + lnseq
perceptorg =~ piccomp + block + matrixreason + digsym + symbolsearch
simil ~~ inform
general =~ verbalcomp + workingmemory + perceptorg'
# Fit the hierarchical model on IQdata
wais.fit3 <- cfa(model = wais.model3, data = IQdata)
# RMSEA/SRMR for the previous (correlated three-factor) model
fitmeasures(wais.fit2, c("rmsea", "srmr"))
## rmsea srmr
## 0.104 0.071
# RMSEA/SRMR for the hierarchical model — identical here, because a
# second-order factor over exactly three first-order factors is just-identified
fitmeasures(wais.fit3, c("rmsea", "srmr"))
## rmsea srmr
## 0.104 0.071
# Path diagram for the hierarchical model, standardized estimates on
# navy edges, tree layout.
semPlot::semPaths(
  object = wais.fit3,
  what = "std",
  whatLabels = "std",
  layout = "tree",
  rotation = 1,
  edge.label.cex = 1,
  edge.color = "navy"
)
Chapter 1 - Explore Data
Import data:
Know data:
Count data (note: the course video for this section is broken, but it still provides some useful code snippets):
Example code includes:
# Read in "bakeoff.csv" as bakeoff; readr guesses column types and reports
# the specification it used (shown below)
bakeoff <- readr::read_csv("./RInputFiles/bakeoff.csv")
## Parsed with column specification:
## cols(
## series = col_integer(),
## episode = col_integer(),
## baker = col_character(),
## signature = col_character(),
## technical = col_integer(),
## showstopper = col_character(),
## result = col_character(),
## uk_airdate = col_date(format = ""),
## us_season = col_integer(),
## us_airdate = col_date(format = "")
## )
# Print bakeoff (tibbles print a compact preview, not the full data)
bakeoff
## # A tibble: 549 x 10
## series episode baker signature technical showstopper result uk_airdate
## <int> <int> <chr> <chr> <int> <chr> <chr> <date>
## 1 1 1 Anne~ "Light J~ 2 Chocolate ~ IN 2010-08-17
## 2 1 1 David Chocolat~ 3 "Black For~ IN 2010-08-17
## 3 1 1 Edd Caramel ~ 1 <NA> IN 2010-08-17
## 4 1 1 Jasm~ Fresh Ma~ NA <NA> IN 2010-08-17
## 5 1 1 Jona~ Carrot C~ 9 Three-laye~ IN 2010-08-17
## 6 1 1 Loui~ Carrot a~ NA "Never Fai~ IN 2010-08-17
## 7 1 1 Mira~ "Triple ~ 8 "Three Tie~ IN 2010-08-17
## 8 1 1 Ruth "Lemon D~ NA "Classic C~ IN 2010-08-17
## 9 1 1 Lea "Cranber~ 10 "Chocolate~ OUT 2010-08-17
## 10 1 1 Mark Sticky M~ NA Heart-shap~ OUT 2010-08-17
## # ... with 539 more rows, and 2 more variables: us_season <int>,
## # us_airdate <date>
# Data set above is already OK - UNKNOWN are NA in CSV
# Filter rows where showstopper is UNKNOWN; zero rows confirms the value
# was already read as NA
bakeoff %>%
filter(showstopper == "UNKNOWN")
## # A tibble: 0 x 10
## # ... with 10 variables: series <int>, episode <int>, baker <chr>,
## # signature <chr>, technical <int>, showstopper <chr>, result <chr>,
## # uk_airdate <date>, us_season <int>, us_airdate <date>
# Re-read, explicitly listing the strings to treat as missing values
bakeoff <- read_csv("./RInputFiles/bakeoff.csv", na = c("", "NA", "UNKNOWN"))
## Parsed with column specification:
## cols(
## series = col_integer(),
## episode = col_integer(),
## baker = col_character(),
## signature = col_character(),
## technical = col_integer(),
## showstopper = col_character(),
## result = col_character(),
## uk_airdate = col_date(format = ""),
## us_season = col_integer(),
## us_airdate = col_date(format = "")
## )
# Show the rows where showstopper was read in as missing
filter(bakeoff, is.na(showstopper))
## # A tibble: 21 x 10
## series episode baker signature technical showstopper result uk_airdate
## <int> <int> <chr> <chr> <int> <chr> <chr> <date>
## 1 1 1 Edd Caramel ~ 1 <NA> IN 2010-08-17
## 2 1 1 Jasm~ Fresh Ma~ NA <NA> IN 2010-08-17
## 3 1 6 Mira~ Lemon Cu~ NA <NA> RUNNE~ 2010-09-21
## 4 2 1 Ian Apple an~ 10 <NA> IN 2011-08-16
## 5 2 1 Jason "Lemon M~ 6 <NA> IN 2011-08-16
## 6 2 1 Urva~ Cherry B~ 7 <NA> IN 2011-08-16
## 7 2 1 Yasm~ Cardamom~ 5 <NA> IN 2011-08-16
## 8 2 1 Holly "Cherry ~ 1 <NA> SB 2011-08-16
## 9 2 2 Ben Chorizo,~ 1 <NA> IN 2011-08-23
## 10 2 2 Ian "Stilton~ 2 <NA> IN 2011-08-23
## # ... with 11 more rows, and 2 more variables: us_season <int>,
## # us_airdate <date>
# Drop rows without a US season, then summarize every variable per
# us_season group with skimr (grouped summary statistics shown below)
bakeoff %>%
filter(!is.na(us_season)) %>%
group_by(us_season) %>%
skimr::skim()
## Skim summary statistics
## n obs: 302
## n variables: 10
## group variables: us_season
##
## -- Variable type:character ---------------------------------------------------------------------------------------
## us_season variable missing complete n min max empty n_unique
## 1 baker 0 78 78 3 9 0 13
## 1 result 0 78 78 2 9 0 5
## 1 showstopper 0 78 78 5 126 0 78
## 1 signature 0 78 78 10 125 0 78
## 2 baker 0 74 74 4 7 0 12
## 2 result 0 74 74 2 9 0 6
## 2 showstopper 1 73 74 8 82 0 73
## 2 signature 1 73 74 15 107 0 73
## 3 baker 0 75 75 3 6 0 12
## 3 result 0 75 75 2 9 0 5
## 3 showstopper 0 75 75 10 70 0 73
## 3 signature 0 75 75 12 64 0 74
## 4 baker 0 75 75 3 9 0 12
## 4 result 0 75 75 2 9 0 5
## 4 showstopper 0 75 75 5 86 0 74
## 4 signature 0 75 75 12 93 0 75
##
## -- Variable type:Date --------------------------------------------------------------------------------------------
## us_season variable missing complete n min max median
## 1 uk_airdate 0 78 78 2013-08-20 2013-10-22 2013-09-10
## 1 us_airdate 0 78 78 2014-12-28 2015-03-01 2015-01-18
## 2 uk_airdate 0 74 74 2014-08-06 2014-10-08 2014-08-27
## 2 us_airdate 0 74 74 2015-09-06 2015-11-08 2015-09-27
## 3 uk_airdate 0 75 75 2015-08-05 2015-10-07 2015-08-26
## 3 us_airdate 0 75 75 2016-07-01 2016-08-12 2016-07-15
## 4 uk_airdate 0 75 75 2016-08-24 2016-10-26 2016-09-14
## 4 us_airdate 0 75 75 2017-06-16 2017-08-04 2017-06-30
## n_unique
## 10
## 10
## 10
## 10
## 10
## 7
## 10
## 8
##
## -- Variable type:integer -----------------------------------------------------------------------------------------
## us_season variable missing complete n mean sd p0 p25 p50 p75 p100
## 1 episode 0 78 78 4.31 2.66 1 2 4 6 10
## 1 series 0 78 78 4 0 4 4 4 4 4
## 1 technical 0 78 78 5.08 3.19 1 2.25 4.5 7 13
## 2 episode 0 74 74 4.38 2.68 1 2 4 6 10
## 2 series 0 74 74 5 0 5 5 5 5 5
## 2 technical 1 73 74 4.73 2.93 1 2 4 7 12
## 3 episode 0 75 75 4.4 2.67 1 2 4 6 10
## 3 series 0 75 75 6 0 6 6 6 6 6
## 3 technical 0 75 75 4.8 2.92 1 2 4 7 12
## 4 episode 0 75 75 4.4 2.67 1 2 4 6 10
## 4 series 0 75 75 7 0 7 7 7 7 7
## 4 technical 0 75 75 4.8 2.92 1 2 4 7 12
## hist
## <U+2587><U+2583><U+2583><U+2582><U+2582><U+2582><U+2582><U+2582>
## <U+2581><U+2581><U+2581><U+2587><U+2581><U+2581><U+2581><U+2581>
## <U+2587><U+2587><U+2583><U+2585><U+2582><U+2583><U+2581><U+2581>
## <U+2587><U+2583><U+2583><U+2583><U+2582><U+2582><U+2582><U+2582>
## <U+2581><U+2581><U+2581><U+2587><U+2581><U+2581><U+2581><U+2581>
## <U+2587><U+2583><U+2587><U+2583><U+2582><U+2583><U+2581><U+2581>
## <U+2587><U+2583><U+2583><U+2583><U+2582><U+2582><U+2582><U+2582>
## <U+2581><U+2581><U+2581><U+2587><U+2581><U+2581><U+2581><U+2581>
## <U+2587><U+2583><U+2587><U+2583><U+2582><U+2583><U+2581><U+2581>
## <U+2587><U+2583><U+2583><U+2583><U+2582><U+2582><U+2582><U+2582>
## <U+2581><U+2581><U+2581><U+2587><U+2581><U+2581><U+2581><U+2581>
## <U+2587><U+2583><U+2587><U+2583><U+2582><U+2583><U+2581><U+2581>
# List the distinct values taken by the result column
distinct(bakeoff, result)
## # A tibble: 6 x 1
## result
## <chr>
## 1 IN
## 2 OUT
## 3 RUNNER UP
## 4 WINNER
## 5 SB
## 6 LEFT
# Tally the number of rows for each distinct result value
count(bakeoff, result)
## # A tibble: 6 x 2
## result n
## <chr> <int>
## 1 IN 393
## 2 LEFT 1
## 3 OUT 70
## 4 RUNNER UP 16
## 5 SB 61
## 6 WINNER 8
# Count whether or not star baker: counting a logical expression splits
# rows into TRUE ("SB") vs FALSE (everything else)
bakeoff %>%
count(result=="SB")
## # A tibble: 2 x 2
## `result == "SB"` n
## <lgl> <int>
## 1 FALSE 488
## 2 TRUE 61
# Count the number of rows (bakers) for each series/episode combination
bakeoff %>%
count(series, episode)
## # A tibble: 74 x 3
## series episode n
## <int> <int> <int>
## 1 1 1 10
## 2 1 2 8
## 3 1 3 6
## 4 1 4 5
## 5 1 5 4
## 6 1 6 3
## 7 2 1 12
## 8 2 2 11
## 9 2 3 10
## 10 2 4 8
## # ... with 64 more rows
# Chain a second count: the first yields one row per series/episode, so
# counting by series then gives the number of episodes per series
bakeoff %>%
count(series, episode) %>%
count(series)
## # A tibble: 8 x 2
## series nn
## <int> <int>
## 1 1 6
## 2 2 8
## 3 3 10
## 4 4 10
## 5 5 10
## 6 6 10
## 7 7 10
## 8 8 10
# Count the number of rows (appearances) per series/baker pair and save
# the result for re-counting below
bakers_by_series <-
bakeoff %>%
count(series, baker)
# Print to view
bakers_by_series
## # A tibble: 95 x 3
## series baker n
## <int> <chr> <int>
## 1 1 Annetha 2
## 2 1 David 4
## 3 1 Edd 6
## 4 1 Jasminder 5
## 5 1 Jonathan 3
## 6 1 Lea 1
## 7 1 Louise 2
## 8 1 Mark 1
## 9 1 Miranda 6
## 10 1 Ruth 6
## # ... with 85 more rows
# Count again by series: one row per series/baker, so this gives the
# number of distinct bakers in each series
bakers_by_series %>%
count(series)
## # A tibble: 8 x 2
## series nn
## <int> <int>
## 1 1 10
## 2 2 12
## 3 3 12
## 4 4 13
## 5 5 12
## 6 6 12
## 7 7 12
## 8 8 12
# Count again by baker, sorted descending: how many series each first
# name appears in (repeated names belong to different people)
bakers_by_series %>%
count(baker, sort=TRUE)
## # A tibble: 86 x 2
## baker nn
## <chr> <int>
## 1 Kate 3
## 2 Ian 2
## 3 James 2
## 4 Louise 2
## 5 Mark 2
## 6 Peter 2
## 7 Robert 2
## 8 Tom 2
## 9 Ali 1
## 10 Alvin 1
## # ... with 76 more rows
# Bar chart of row counts per episode, one panel per series — visual
# confirmation of the per-episode baker counts above
ggplot(bakeoff, aes(x=episode)) +
geom_bar() +
facet_wrap(~series)
Chapter 2 - Tame Data
Cast column types:
Recode values:
recode(..., `0` = “other”, .default = “student”) # 0 becomes “other”, anything else becomes “student”; recode(..., `0` = NA_character_, .default = “student”) # maps a specific value to NA instead. Select variables:
Tame variable names:
Example code includes:
# NOTE THAT THIS WILL THROW WARNINGS
# Try to cast technical as a number; "N/A" strings in the column cannot
# be parsed, producing the 7 parsing failures shown below
desserts <- readr::read_csv("./RInputFiles/desserts.csv",
col_types = cols(
technical = col_number())
)
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 7 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 4 technical a number N/A './RInputFiles/desserts.csv' file 2 6 technical a number N/A './RInputFiles/desserts.csv' row 3 8 technical a number N/A './RInputFiles/desserts.csv' col 4 10 technical a number N/A './RInputFiles/desserts.csv' expected 5 34 technical a number N/A './RInputFiles/desserts.csv'
## ... ................. ... .............................................................. ........ .............................................................. ...... .............................................................. .... .............................................................. ... .............................................................. ... .............................................................. ........ ..............................................................
## See problems(...) for more details.
# View the parsing problems readr recorded on the tibble (row, column,
# expected vs actual value)
readr::problems(desserts)
## # A tibble: 7 x 5
## row col expected actual file
## <int> <chr> <chr> <chr> <chr>
## 1 4 technical a number N/A './RInputFiles/desserts.csv'
## 2 6 technical a number N/A './RInputFiles/desserts.csv'
## 3 8 technical a number N/A './RInputFiles/desserts.csv'
## 4 10 technical a number N/A './RInputFiles/desserts.csv'
## 5 34 technical a number N/A './RInputFiles/desserts.csv'
## 6 35 technical a number N/A './RInputFiles/desserts.csv'
## 7 36 technical a number N/A './RInputFiles/desserts.csv'
# NOTE THAT THIS WILL FIX THE ERRORS
# Re-read, adding "N/A" to the strings treated as missing so technical
# parses cleanly as a number
desserts <- readr::read_csv("./RInputFiles/desserts.csv",
col_types = cols(
technical = col_number()),
na = c("", "NA", "N/A")
)
# View parsing problems — now empty
readr::problems(desserts)
## # tibble [0 x 4]
## # ... with 4 variables: row <int>, col <int>, expected <chr>, actual <chr>
# Find the strptime format that parses uk_airdate ("17 August 2010")
readr::parse_date("17 August 2010", format = "%d %B %Y")
## [1] "2010-08-17"
# Re-read, also casting uk_airdate as a Date with that format
desserts <- readr::read_csv("./RInputFiles/desserts.csv",
na = c("", "NA", "N/A"),
col_types = cols(
technical = col_number(),
uk_airdate = col_date("%d %B %Y")
))
# Print rows sorted by descending uk_airdate (most recent first)
desserts %>%
arrange(desc(uk_airdate))
## # A tibble: 549 x 16
## series episode baker technical result uk_airdate us_season us_airdate
## <int> <int> <chr> <dbl> <chr> <date> <int> <date>
## 1 8 10 Kate 3 RUNNE~ 2017-10-31 NA NA
## 2 8 10 Stev~ 1 RUNNE~ 2017-10-31 NA NA
## 3 8 10 Soph~ 2 WINNER 2017-10-31 NA NA
## 4 8 9 Kate 4 IN 2017-10-24 NA NA
## 5 8 9 Stev~ 3 IN 2017-10-24 NA NA
## 6 8 9 Stac~ 2 OUT 2017-10-24 NA NA
## 7 8 9 Soph~ 1 SB 2017-10-24 NA NA
## 8 8 8 Kate 2 IN 2017-10-17 NA NA
## 9 8 8 Soph~ 4 IN 2017-10-17 NA NA
## 10 8 8 Stev~ 1 IN 2017-10-17 NA NA
## # ... with 539 more rows, and 8 more variables:
## # showstopper_chocolate <chr>, showstopper_dessert <chr>,
## # showstopper_fruit <chr>, showstopper_nut <chr>,
## # signature_chocolate <chr>, signature_dessert <chr>,
## # signature_fruit <chr>, signature_nut <chr>
# Cast result as a factor; levels=NULL lets readr take the levels from
# the unique values found in the data
desserts <- readr::read_csv("./RInputFiles/desserts.csv",
na = c("", "NA", "N/A"),
col_types = cols(
technical = col_number(),
uk_airdate = col_date(format = "%d %B %Y"),
result = col_factor(levels=NULL)
))
# Glimpse to confirm the column types (result is now <fct>)
glimpse(desserts)
## Observations: 549
## Variables: 16
## $ series <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ episode <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2...
## $ baker <chr> "Annetha", "David", "Edd", "Jasminder", ...
## $ technical <dbl> 2, 3, 1, NA, 9, NA, 8, NA, 10, NA, 8, 6,...
## $ result <fct> IN, IN, IN, IN, IN, IN, IN, IN, OUT, OUT...
## $ uk_airdate <date> 2010-08-17, 2010-08-17, 2010-08-17, 201...
## $ us_season <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ us_airdate <date> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ showstopper_chocolate <chr> "chocolate", "chocolate", "no chocolate"...
## $ showstopper_dessert <chr> "other", "other", "other", "other", "oth...
## $ showstopper_fruit <chr> "no fruit", "no fruit", "no fruit", "no ...
## $ showstopper_nut <chr> "no nut", "no nut", "no nut", "no nut", ...
## $ signature_chocolate <chr> "no chocolate", "chocolate", "no chocola...
## $ signature_dessert <chr> "cake", "cake", "cake", "cake", "cake", ...
## $ signature_fruit <chr> "no fruit", "fruit", "fruit", "fruit", "...
## $ signature_nut <chr> "no nut", "no nut", "no nut", "no nut", ...
# Keep a copy of the wide data before reshaping
oldDesserts <- desserts
# Reshape the 8 showstopper_*/signature_* columns: gather to long form,
# split "challenge_ingredient" names apart, then spread the ingredient
# back to columns — doubling the rows (one per baker per challenge).
# (tidyr 0.8.x gather/spread; pivot_longer/pivot_wider did not exist yet.)
tempDesserts <- desserts %>%
gather(key="type_ing", value="status", starts_with(c("showstopper")), starts_with(c("signature"))) %>%
separate(type_ing, into=c("challenge", "ingredient"), sep="_") %>%
spread(ingredient, status)
glimpse(tempDesserts)
## Observations: 1,098
## Variables: 13
## $ series <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ episode <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, ...
## $ baker <chr> "Annetha", "David", "Edd", "Jasminder", "Jonathan",...
## $ technical <dbl> 2, 3, 1, NA, 9, NA, 8, NA, 10, NA, 8, 6, 2, 1, 3, 5...
## $ result <fct> IN, IN, IN, IN, IN, IN, IN, IN, OUT, OUT, IN, IN, I...
## $ uk_airdate <date> 2010-08-17, 2010-08-17, 2010-08-17, 2010-08-17, 20...
## $ us_season <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ us_airdate <date> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ challenge <chr> "showstopper", "showstopper", "showstopper", "shows...
## $ chocolate <chr> "chocolate", "chocolate", "no chocolate", "no choco...
## $ dessert <chr> "other", "other", "other", "other", "other", "cake"...
## $ fruit <chr> "no fruit", "no fruit", "no fruit", "no fruit", "fr...
## $ nut <chr> "no nut", "no nut", "no nut", "no nut", "almond", "...
# Adopt the reshaped data as the working desserts tibble
desserts <- tempDesserts
# Count rows by nut variable, most frequent first
desserts %>%
count(nut, sort=TRUE)
## # A tibble: 8 x 2
## nut n
## <chr> <int>
## 1 no nut 944
## 2 almond 35
## 3 walnut 35
## 4 pistachio 29
## 5 filbert 23
## 6 pecan 14
## 7 multiple 9
## 8 peanut 9
# Recode "filbert" as "hazelnut" (same nut, two names) with dplyr::recode;
# unmatched values pass through unchanged
desserts <- desserts %>%
mutate(nut = recode(nut, "filbert" = "hazelnut"))
# Count rows again to confirm the recode
desserts %>%
count(nut, sort = TRUE)
## # A tibble: 8 x 2
## nut n
## <chr> <int>
## 1 no nut 944
## 2 almond 35
## 3 walnut 35
## 4 pistachio 29
## 5 hazelnut 23
## 6 pecan 14
## 7 multiple 9
## 8 peanut 9
# Recode "no nut" as missing; NA_character_ keeps the column character
# (a bare NA would be logical and error inside recode)
desserts <- desserts %>%
mutate(nut = recode(nut, "filbert" = "hazelnut",
"no nut" = NA_character_))
# Count rows again — "no nut" rows now show as <NA>
desserts %>%
count(nut, sort = TRUE)
## # A tibble: 8 x 2
## nut n
## <chr> <int>
## 1 <NA> 944
## 2 almond 35
## 3 walnut 35
## 4 pistachio 29
## 5 hazelnut 23
## 6 pecan 14
## 7 multiple 9
## 8 peanut 9
# Create tech_win as a factor: 1 when technical == 1 (won the technical),
# 0 otherwise; recode_factor returns a factor directly
desserts <- desserts %>%
mutate(tech_win = recode_factor(technical, `1` = 1,
.default = 0))
# Cross-tabulate against the raw condition to verify the recode
# (NA technical stays NA rather than falling into .default)
desserts %>%
count(technical == 1, tech_win)
## # A tibble: 3 x 3
## `technical == 1` tech_win n
## <lgl> <fct> <int>
## 1 FALSE 0 936
## 2 TRUE 1 146
## 3 NA <NA> 16
ratings0 <- readr::read_csv("./RInputFiles/02.03_messy_ratings.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## series = col_integer(),
## episodes = col_integer(),
## premiere = col_character(),
## finale = col_character(),
## winner = col_character(),
## day_of_week = col_character(),
## timeslot = col_time(format = ""),
## channel = col_character(),
## runner_up_1 = col_character(),
## runner_up_2 = col_character(),
## season = col_integer(),
## season_premiere = col_character(),
## season_finale = col_character(),
## e1_uk_airdate = col_character(),
## e2_uk_airdate = col_character(),
## e3_uk_airdate = col_character(),
## e4_uk_airdate = col_character(),
## e5_uk_airdate = col_character(),
## e6_uk_airdate = col_character(),
## e7_uk_airdate = col_character()
## # ... with 3 more columns
## )
## See spec(...) for full column specifications.
str(ratings0, give.attr=FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame': 8 obs. of 44 variables:
## $ series : int 1 2 3 4 5 6 7 8
## $ episodes : int 6 8 10 10 10 10 10 10
## $ premiere : chr "17-Aug-10" "14-Aug-11" "14-Aug-12" "20-Aug-13" ...
## $ finale : chr "21-Sep-10" "4-Oct-11" "16-Oct-12" "22-Oct-13" ...
## $ winner : chr "Edd Kimber" "Joanne Wheatley" "John Whaite" "Frances Quinn" ...
## $ avg_uk_viewers : num 2.77 4 5 7.35 10.04 ...
## $ day_of_week : chr "Tuesday" "Tuesday" "Tuesday" "Tuesday" ...
## $ timeslot : 'hms' num 20:00:00 20:00:00 20:00:00 20:00:00 ...
## $ channel : chr "BBC Two" "BBC Two" "BBC Two" "BBC Two" ...
## $ runner_up_1 : chr "Miranda Gore Browne" "Holly Bell" "Brendan Lynch" "Kimberley Wilson" ...
## $ runner_up_2 : chr "Ruth Clemens" "Mary-Anne Boermans" "James Morton" "Ruby Tandoh" ...
## $ season : int NA NA NA 1 2 3 4 NA
## $ season_premiere : chr NA NA NA "12/28/14" ...
## $ season_finale : chr NA NA NA "3/1/15" ...
## $ e1_viewers_7day : num 2.24 3.1 3.85 6.6 8.51 ...
## $ e1_viewers_28day : num NA NA NA NA NA ...
## $ e2_viewers_7day : num 3 3.53 4.6 6.65 8.79 ...
## $ e2_viewers_28day : num NA NA NA NA NA ...
## $ e3_viewers_7day : num 3 3.82 4.53 7.17 9.28 ...
## $ e3_viewers_28day : num NA NA NA NA NA ...
## $ e4_viewers_7day : num 2.6 3.6 4.71 6.82 10.25 ...
## $ e4_viewers_28day : num NA NA NA NA NA ...
## $ e5_viewers_7day : num 3.03 3.83 4.61 6.95 9.95 ...
## $ e5_viewers_28day : num NA NA NA NA NA ...
## $ e6_viewers_7day : num 2.75 4.25 4.82 7.32 10.13 ...
## $ e6_viewers_28day : num NA NA NA NA NA ...
## $ e7_viewers_7day : num NA 4.42 5.1 7.76 10.28 ...
## $ e7_viewers_28day : num NA NA NA NA NA ...
## $ e8_viewers_7day : num NA 5.06 5.35 7.41 9.02 ...
## $ e8_viewers_28day : num NA NA NA NA NA ...
## $ e9_viewers_7day : num NA NA 5.7 7.41 10.67 ...
## $ e9_viewers_28day : num NA NA NA NA NA ...
## $ e10_viewers_7day : num NA NA 6.74 9.45 13.51 ...
## $ e10_viewers_28day: num NA NA NA NA NA ...
## $ e1_uk_airdate : chr "8/17/10" "8/16/11" "8/14/12" "8/20/13" ...
## $ e2_uk_airdate : chr "8/24/10" "8/23/11" "8/21/12" "8/27/13" ...
## $ e3_uk_airdate : chr "8/31/10" "8/30/11" "8/28/12" "9/3/13" ...
## $ e4_uk_airdate : chr "9/7/10" "9/6/11" "9/4/12" "9/10/13" ...
## $ e5_uk_airdate : chr "9/14/10" "9/13/11" "9/11/12" "9/17/13" ...
## $ e6_uk_airdate : chr "9/21/10" "9/20/11" "9/18/12" "9/24/13" ...
## $ e7_uk_airdate : chr NA "9/27/11" "9/25/12" "10/1/13" ...
## $ e8_uk_airdate : chr NA "10/4/11" "10/2/12" "10/8/13" ...
## $ e9_uk_airdate : chr NA NA "10/9/12" "10/15/13" ...
## $ e10_uk_airdate : chr NA NA "10/16/12" "10/22/13" ...
# Tame the ratings data: keep series 3+, shorten day_of_week to day,
# make series a factor, parse the m/d/y season dates with lubridate,
# derive finale-vs-premiere viewer growth, and drop UK airdate columns
ratings <- ratings0 %>%
filter(series >= 3) %>%
rename(day=day_of_week) %>%
mutate(series=factor(series),
season_premiere=lubridate::mdy(season_premiere),
season_finale=lubridate::mdy(season_finale),
viewer_growth = (e10_viewers_7day - e1_viewers_7day)
) %>%
select(-contains("uk_airdate"))
# Recode channel as dummy: bbc (1) or not (0) — only "Channel 4" maps to 0
ratings <- ratings %>%
mutate(bbc = recode_factor(channel, "Channel 4"=0, .default=1))
# Look at the variables to plot next
ratings %>% select(series, channel, bbc, viewer_growth)
## # A tibble: 6 x 4
## series channel bbc viewer_growth
## <fct> <chr> <fct> <dbl>
## 1 3 BBC Two 1 2.89
## 2 4 BBC Two 1 2.85
## 3 5 BBC One 1 5
## 4 6 BBC One 1 3.43
## 5 7 BBC One 1 2.32
## 6 8 Channel 4 0 0.580
# Bar chart of viewer growth per series, filled by the bbc dummy;
# geom_col uses the y values as given (unlike geom_bar, which counts)
ggplot(ratings, aes(x = series, y = viewer_growth, fill = bbc)) +
geom_col()
# Move channel to the first column; everything() keeps the rest in order
ratings %>%
select(channel, everything())
## # A tibble: 6 x 36
## channel series episodes premiere finale winner avg_uk_viewers day
## <chr> <fct> <int> <chr> <chr> <chr> <dbl> <chr>
## 1 BBC Two 3 10 14-Aug-~ 16-Oc~ John ~ 5 Tues~
## 2 BBC Two 4 10 20-Aug-~ 22-Oc~ Franc~ 7.35 Tues~
## 3 BBC One 5 10 6-Aug-14 8-Oct~ Nancy~ 10.0 Wedn~
## 4 BBC One 6 10 5-Aug-15 7-Oct~ Nadiy~ 12.5 Wedn~
## 5 BBC One 7 10 24-Aug-~ 26-Oc~ Candi~ 13.8 Wedn~
## 6 Channe~ 8 10 29-Aug-~ 31-Oc~ Sophi~ 9.29 Tues~
## # ... with 28 more variables: timeslot <time>, runner_up_1 <chr>,
## # runner_up_2 <chr>, season <int>, season_premiere <date>,
## # season_finale <date>, e1_viewers_7day <dbl>, e1_viewers_28day <dbl>,
## # e2_viewers_7day <dbl>, e2_viewers_28day <dbl>, e3_viewers_7day <dbl>,
## # e3_viewers_28day <dbl>, e4_viewers_7day <dbl>, e4_viewers_28day <dbl>,
## # e5_viewers_7day <dbl>, e5_viewers_28day <dbl>, e6_viewers_7day <dbl>,
## # e6_viewers_28day <dbl>, e7_viewers_7day <dbl>, e7_viewers_28day <dbl>,
## # e8_viewers_7day <dbl>, e8_viewers_28day <dbl>, e9_viewers_7day <dbl>,
## # e9_viewers_28day <dbl>, e10_viewers_7day <dbl>,
## # e10_viewers_28day <dbl>, viewer_growth <dbl>, bbc <fct>
# Drop the 7- and 28-day episode viewer columns, which all end in "day"
ratings %>%
select(-ends_with("day"))
## # A tibble: 6 x 15
## series episodes premiere finale winner avg_uk_viewers timeslot channel
## <fct> <int> <chr> <chr> <chr> <dbl> <time> <chr>
## 1 3 10 14-Aug-~ 16-Oc~ John ~ 5 20:00 BBC Two
## 2 4 10 20-Aug-~ 22-Oc~ Franc~ 7.35 20:00 BBC Two
## 3 5 10 6-Aug-14 8-Oct~ Nancy~ 10.0 20:00 BBC One
## 4 6 10 5-Aug-15 7-Oct~ Nadiy~ 12.5 20:00 BBC One
## 5 7 10 24-Aug-~ 26-Oc~ Candi~ 13.8 20:00 BBC One
## 6 8 10 29-Aug-~ 31-Oc~ Sophi~ 9.29 20:00 Channe~
## # ... with 7 more variables: runner_up_1 <chr>, runner_up_2 <chr>,
## # season <int>, season_premiere <date>, season_finale <date>,
## # viewer_growth <dbl>, bbc <fct>
# Edit to move channel to first and drop episode viewer data
ratings %>%
select(-ends_with("day")) %>%
select(channel, everything())
## # A tibble: 6 x 15
## channel series episodes premiere finale winner avg_uk_viewers timeslot
## <chr> <fct> <int> <chr> <chr> <chr> <dbl> <time>
## 1 BBC Two 3 10 14-Aug-~ 16-Oc~ John ~ 5 20:00
## 2 BBC Two 4 10 20-Aug-~ 22-Oc~ Franc~ 7.35 20:00
## 3 BBC One 5 10 6-Aug-14 8-Oct~ Nancy~ 10.0 20:00
## 4 BBC One 6 10 5-Aug-15 7-Oct~ Nadiy~ 12.5 20:00
## 5 BBC One 7 10 24-Aug-~ 26-Oc~ Candi~ 13.8 20:00
## 6 Channe~ 8 10 29-Aug-~ 31-Oc~ Sophi~ 9.29 20:00
## # ... with 7 more variables: runner_up_1 <chr>, runner_up_2 <chr>,
## # season <int>, season_premiere <date>, season_finale <date>,
## # viewer_growth <dbl>, bbc <fct>
# Glimpse messy names
# glimpse(messy_ratings)
# Reformat to lower camelcase
# ratings <- messy_ratings %>%
# clean_names(case="lower_camel")
# Glimpse cleaned names
# glimpse(ratings)
# Reformat to snake case
# ratings <- messy_ratings %>%
# clean_names("snake")
# Glimpse cleaned names
# glimpse(ratings)
# Select 7-day viewer data by series
viewers_7day <- ratings %>%
select(series, contains("7day"))
# Glimpse
glimpse(viewers_7day)
## Observations: 6
## Variables: 11
## $ series <fct> 3, 4, 5, 6, 7, 8
## $ e1_viewers_7day <dbl> 3.85, 6.60, 8.51, 11.62, 13.58, 9.46
## $ e2_viewers_7day <dbl> 4.60, 6.65, 8.79, 11.59, 13.45, 9.23
## $ e3_viewers_7day <dbl> 4.53, 7.17, 9.28, 12.01, 13.01, 8.68
## $ e4_viewers_7day <dbl> 4.71, 6.82, 10.25, 12.36, 13.29, 8.55
## $ e5_viewers_7day <dbl> 4.61, 6.95, 9.95, 12.39, 13.12, 8.61
## $ e6_viewers_7day <dbl> 4.82, 7.32, 10.13, 12.00, 13.13, 8.61
## $ e7_viewers_7day <dbl> 5.10, 7.76, 10.28, 12.35, 13.45, 9.01
## $ e8_viewers_7day <dbl> 5.350, 7.410, 9.023, 11.090, 13.260, 8.950
## $ e9_viewers_7day <dbl> 5.70, 7.41, 10.67, 12.65, 13.44, 9.03
## $ e10_viewers_7day <dbl> 6.74, 9.45, 13.51, 15.05, 15.90, 10.04
# Adapt code to also rename 7-day viewer data
# Naming a selection helper inside select() (here `viewers_7day_ = ends_with(...)`)
# renames every matched column to that prefix plus a running number, as the
# glimpse() output below shows (viewers_7day_1 .. viewers_7day_10)
viewers_7day <- ratings %>%
select(series, viewers_7day_ = ends_with("7day"))
# Glimpse
glimpse(viewers_7day)
## Observations: 6
## Variables: 11
## $ series <fct> 3, 4, 5, 6, 7, 8
## $ viewers_7day_1 <dbl> 3.85, 6.60, 8.51, 11.62, 13.58, 9.46
## $ viewers_7day_2 <dbl> 4.60, 6.65, 8.79, 11.59, 13.45, 9.23
## $ viewers_7day_3 <dbl> 4.53, 7.17, 9.28, 12.01, 13.01, 8.68
## $ viewers_7day_4 <dbl> 4.71, 6.82, 10.25, 12.36, 13.29, 8.55
## $ viewers_7day_5 <dbl> 4.61, 6.95, 9.95, 12.39, 13.12, 8.61
## $ viewers_7day_6 <dbl> 4.82, 7.32, 10.13, 12.00, 13.13, 8.61
## $ viewers_7day_7 <dbl> 5.10, 7.76, 10.28, 12.35, 13.45, 9.01
## $ viewers_7day_8 <dbl> 5.350, 7.410, 9.023, 11.090, 13.260, 8.950
## $ viewers_7day_9 <dbl> 5.70, 7.41, 10.67, 12.65, 13.44, 9.03
## $ viewers_7day_10 <dbl> 6.74, 9.45, 13.51, 15.05, 15.90, 10.04
# Adapt code to drop 28-day columns; move 7-day to front
viewers_7day <- ratings %>%
select(viewers_7day_ = ends_with("7day"), everything(), -contains("28day"))
# Glimpse
glimpse(viewers_7day)
## Observations: 6
## Variables: 26
## $ viewers_7day_1 <dbl> 3.85, 6.60, 8.51, 11.62, 13.58, 9.46
## $ viewers_7day_2 <dbl> 4.60, 6.65, 8.79, 11.59, 13.45, 9.23
## $ viewers_7day_3 <dbl> 4.53, 7.17, 9.28, 12.01, 13.01, 8.68
## $ viewers_7day_4 <dbl> 4.71, 6.82, 10.25, 12.36, 13.29, 8.55
## $ viewers_7day_5 <dbl> 4.61, 6.95, 9.95, 12.39, 13.12, 8.61
## $ viewers_7day_6 <dbl> 4.82, 7.32, 10.13, 12.00, 13.13, 8.61
## $ viewers_7day_7 <dbl> 5.10, 7.76, 10.28, 12.35, 13.45, 9.01
## $ viewers_7day_8 <dbl> 5.350, 7.410, 9.023, 11.090, 13.260, 8.950
## $ viewers_7day_9 <dbl> 5.70, 7.41, 10.67, 12.65, 13.44, 9.03
## $ viewers_7day_10 <dbl> 6.74, 9.45, 13.51, 15.05, 15.90, 10.04
## $ series <fct> 3, 4, 5, 6, 7, 8
## $ episodes <int> 10, 10, 10, 10, 10, 10
## $ premiere <chr> "14-Aug-12", "20-Aug-13", "6-Aug-14", "5-Aug-1...
## $ finale <chr> "16-Oct-12", "22-Oct-13", "8-Oct-14", "7-Oct-1...
## $ winner <chr> "John Whaite", "Frances Quinn", "Nancy Birtwhi...
## $ avg_uk_viewers <dbl> 5.00, 7.35, 10.04, 12.50, 13.85, 9.29
## $ day <chr> "Tuesday", "Tuesday", "Wednesday", "Wednesday"...
## $ timeslot <time> 20:00:00, 20:00:00, 20:00:00, 20:00:00, 20:00...
## $ channel <chr> "BBC Two", "BBC Two", "BBC One", "BBC One", "B...
## $ runner_up_1 <chr> "Brendan Lynch", "Kimberley Wilson", "Luis Tro...
## $ runner_up_2 <chr> "James Morton", "Ruby Tandoh", "Richard Burr",...
## $ season <int> NA, 1, 2, 3, 4, NA
## $ season_premiere <date> NA, 2014-12-28, 2015-09-06, 2016-07-01, 2017-...
## $ season_finale <date> NA, 2015-03-01, 2015-11-08, 2016-08-12, 2017-...
## $ viewer_growth <dbl> 2.89, 2.85, 5.00, 3.43, 2.32, 0.58
## $ bbc <fct> 1, 1, 1, 1, 1, 0
# Adapt code to keep original order
viewers_7day <- ratings %>%
select(everything(), -ends_with("28day"), viewers_7day_ = ends_with("7day"))
# Glimpse
glimpse(viewers_7day)
## Observations: 6
## Variables: 26
## $ series <fct> 3, 4, 5, 6, 7, 8
## $ episodes <int> 10, 10, 10, 10, 10, 10
## $ premiere <chr> "14-Aug-12", "20-Aug-13", "6-Aug-14", "5-Aug-1...
## $ finale <chr> "16-Oct-12", "22-Oct-13", "8-Oct-14", "7-Oct-1...
## $ winner <chr> "John Whaite", "Frances Quinn", "Nancy Birtwhi...
## $ avg_uk_viewers <dbl> 5.00, 7.35, 10.04, 12.50, 13.85, 9.29
## $ day <chr> "Tuesday", "Tuesday", "Wednesday", "Wednesday"...
## $ timeslot <time> 20:00:00, 20:00:00, 20:00:00, 20:00:00, 20:00...
## $ channel <chr> "BBC Two", "BBC Two", "BBC One", "BBC One", "B...
## $ runner_up_1 <chr> "Brendan Lynch", "Kimberley Wilson", "Luis Tro...
## $ runner_up_2 <chr> "James Morton", "Ruby Tandoh", "Richard Burr",...
## $ season <int> NA, 1, 2, 3, 4, NA
## $ season_premiere <date> NA, 2014-12-28, 2015-09-06, 2016-07-01, 2017-...
## $ season_finale <date> NA, 2015-03-01, 2015-11-08, 2016-08-12, 2017-...
## $ viewers_7day_1 <dbl> 3.85, 6.60, 8.51, 11.62, 13.58, 9.46
## $ viewers_7day_2 <dbl> 4.60, 6.65, 8.79, 11.59, 13.45, 9.23
## $ viewers_7day_3 <dbl> 4.53, 7.17, 9.28, 12.01, 13.01, 8.68
## $ viewers_7day_4 <dbl> 4.71, 6.82, 10.25, 12.36, 13.29, 8.55
## $ viewers_7day_5 <dbl> 4.61, 6.95, 9.95, 12.39, 13.12, 8.61
## $ viewers_7day_6 <dbl> 4.82, 7.32, 10.13, 12.00, 13.13, 8.61
## $ viewers_7day_7 <dbl> 5.10, 7.76, 10.28, 12.35, 13.45, 9.01
## $ viewers_7day_8 <dbl> 5.350, 7.410, 9.023, 11.090, 13.260, 8.950
## $ viewers_7day_9 <dbl> 5.70, 7.41, 10.67, 12.65, 13.44, 9.03
## $ viewers_7day_10 <dbl> 6.74, 9.45, 13.51, 15.05, 15.90, 10.04
## $ viewer_growth <dbl> 2.89, 2.85, 5.00, 3.43, 2.32, 0.58
## $ bbc <fct> 1, 1, 1, 1, 1, 0
Chapter 3 - Tidy Your Data
Introduction to Tidy Data:
Gather:
Separate:
Spread:
Tidy multiple sets of data:
Example code includes:
ratings1 <- readr::read_csv("./RInputFiles/messy_ratings.csv")
## Parsed with column specification:
## cols(
## series = col_integer(),
## e1 = col_double(),
## e2 = col_double(),
## e3 = col_double(),
## e4 = col_double(),
## e5 = col_double(),
## e6 = col_double(),
## e7 = col_double(),
## e8 = col_double(),
## e9 = col_double(),
## e10 = col_double()
## )
oldRatings <- ratings
ratings <- ratings1
ratings1
## # A tibble: 8 x 11
## series e1 e2 e3 e4 e5 e6 e7 e8 e9 e10
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 2.24 3 3 2.6 3.03 2.75 NA NA NA NA
## 2 2 3.1 3.53 3.82 3.6 3.83 4.25 4.42 5.06 NA NA
## 3 3 3.85 4.6 4.53 4.71 4.61 4.82 5.1 5.35 5.7 6.74
## 4 4 6.6 6.65 7.17 6.82 6.95 7.32 7.76 7.41 7.41 9.45
## 5 5 8.51 8.79 9.28 10.2 9.95 10.1 10.3 9.02 10.7 13.5
## 6 6 11.6 11.6 12.0 12.4 12.4 12 12.4 11.1 12.6 15.0
## 7 7 13.6 13.4 13.0 13.3 13.1 13.1 13.4 13.3 13.4 15.9
## 8 8 9.46 9.23 8.68 8.55 8.61 8.61 9.01 8.95 9.03 10.0
# Plot of episode 1 viewers by series
# geom_col() is the idiomatic shorthand for geom_bar(stat = "identity")
# (the same form the adapted chunk below already uses)
ratings %>%
  ggplot(aes(x = series, y = e1)) +
  geom_col()
# Adapt code to plot episode 2 viewers by series
ggplot(ratings, aes(x = series, y = e2)) +
geom_col()
# Gather and count episodes
# NOTE(review): gather() is superseded by pivot_longer() in tidyr >= 1.0;
# kept as-is so the code matches the transcript output below.
# factor_key = TRUE keeps the episode columns in their original e1..e10 order;
# na.rm = TRUE drops series/episode combinations with no recorded viewers
tidy_ratings <- ratings %>%
gather(key = "episode", value = "viewers_7day", -series,
factor_key = TRUE, na.rm = TRUE) %>%
arrange(series, episode) %>%
mutate(episode_count = row_number())
# Plot viewers by episode and series
# episode_count is a running index across all series, so the bars form one
# continuous timeline colored by series
ggplot(tidy_ratings, aes(x = episode_count, y = viewers_7day, fill = as.factor(series))) +
geom_col()
ratings2 <- readr::read_csv("./RInputFiles/messy_ratings2.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## series = col_integer()
## )
## See spec(...) for full column specifications.
ratings2$series <- as.factor(ratings2$series)
ratings2
## # A tibble: 8 x 21
## series e1_7day e1_28day e2_7day e2_28day e3_7day e3_28day e4_7day
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 2.24 NA 3 NA 3 NA 2.6
## 2 2 3.1 NA 3.53 NA 3.82 NA 3.6
## 3 3 3.85 NA 4.6 NA 4.53 NA 4.71
## 4 4 6.6 NA 6.65 NA 7.17 NA 6.82
## 5 5 8.51 NA 8.79 NA 9.28 NA 10.2
## 6 6 11.6 11.7 11.6 11.8 12.0 NA 12.4
## 7 7 13.6 13.9 13.4 13.7 13.0 13.4 13.3
## 8 8 9.46 9.72 9.23 9.53 8.68 9.06 8.55
## # ... with 13 more variables: e4_28day <dbl>, e5_7day <dbl>,
## # e5_28day <dbl>, e6_7day <dbl>, e6_28day <dbl>, e7_7day <dbl>,
## # e7_28day <dbl>, e8_7day <dbl>, e8_28day <dbl>, e9_7day <dbl>,
## # e9_28day <dbl>, e10_7day <dbl>, e10_28day <dbl>
# Gather 7-day viewers by episode (ratings2 already loaded)
week_ratings <- ratings2 %>%
select(series, ends_with("7day")) %>%
gather(episode, viewers_7day, ends_with("7day"), na.rm = TRUE, factor_key = TRUE)
# Plot 7-day viewers by episode and series
ggplot(week_ratings, aes(x = episode, y = viewers_7day, group = series)) +
geom_line() +
facet_wrap(~series)
# Edit to parse episode number
# separate(into = "episode", extra = "drop") keeps only the text before the
# first separator (e.g. "e1" out of "e1_7day"); parse_number() then strips the
# leading "e", leaving a numeric episode
week_ratings <- ratings2 %>%
select(series, ends_with("7day")) %>%
gather(episode, viewers_7day, ends_with("7day"), na.rm = TRUE) %>%
separate(episode, into = "episode", extra = "drop") %>%
mutate(episode = parse_number(episode))
# Edit your code to color by series and add a theme
# NOTE(review): guides(color = FALSE) is deprecated in ggplot2 >= 3.3.4 in
# favour of guides(color = "none"); FALSE is the accepted form for the
# ggplot2 3.0.0 pinned at the top of this document
ggplot(week_ratings, aes(x = episode, y = viewers_7day,
group = series, color = series)) +
geom_line() +
facet_wrap(~series) +
guides(color = FALSE) +
theme_minimal()
week_ratings_dec <- week_ratings %>%
mutate(viewers_7day=as.character(viewers_7day)) %>%
separate(viewers_7day, into=c("viewers_millions", "viewers_decimal"), sep="\\.") %>%
mutate(viewers_decimal=ifelse(is.na(viewers_decimal), ".", paste0(".", viewers_decimal))) %>%
dplyr::arrange(series, episode)
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 3 rows [9,
## 17, 46].
# Unite series and episode
ratings3 <- week_ratings_dec %>%
unite("viewers_7day", viewers_millions, viewers_decimal)
# Print to view
ratings3
## # A tibble: 74 x 3
## series episode viewers_7day
## <fct> <dbl> <chr>
## 1 1 1 2_.24
## 2 1 2 3_.
## 3 1 3 3_.
## 4 1 4 2_.6
## 5 1 5 3_.03
## 6 1 6 2_.75
## 7 2 1 3_.1
## 8 2 2 3_.53
## 9 2 3 3_.82
## 10 2 4 3_.6
## # ... with 64 more rows
# Adapt to change the separator
ratings3 <- week_ratings_dec %>%
unite(viewers_7day, viewers_millions, viewers_decimal, sep="")
# Print to view
ratings3
## # A tibble: 74 x 3
## series episode viewers_7day
## <fct> <dbl> <chr>
## 1 1 1 2.24
## 2 1 2 3.
## 3 1 3 3.
## 4 1 4 2.6
## 5 1 5 3.03
## 6 1 6 2.75
## 7 2 1 3.1
## 8 2 2 3.53
## 9 2 3 3.82
## 10 2 4 3.6
## # ... with 64 more rows
# Adapt to cast viewers as a number
ratings3 <- week_ratings_dec %>%
unite(viewers_7day, viewers_millions, viewers_decimal, sep="") %>%
mutate(viewers_7day = parse_number(viewers_7day))
# Print to view
ratings3
## # A tibble: 74 x 3
## series episode viewers_7day
## <fct> <dbl> <dbl>
## 1 1 1 2.24
## 2 1 2 3
## 3 1 3 3
## 4 1 4 2.6
## 5 1 5 3.03
## 6 1 6 2.75
## 7 2 1 3.1
## 8 2 2 3.53
## 9 2 3 3.82
## 10 2 4 3.6
## # ... with 64 more rows
# Create tidy data with 7- and 28-day viewers
# NOTE(review): gather()/spread() are superseded in tidyr >= 1.0 by
# pivot_longer()/pivot_wider(); kept as-is to match the printed output below
tidy_ratings_all <- ratings2 %>%
gather(episode, viewers, ends_with("day"), na.rm = TRUE) %>%
separate(episode, into = c("episode", "days")) %>%
mutate(episode = parse_number(episode),
days = parse_number(days))
# Adapt to spread counted values
# count(..., wt = viewers) sums viewers per series/days group; spread() then
# makes one column per value of `days`, prefixed via sep = "_" (days_7, days_28)
tidy_ratings_all %>%
count(series, days, wt = viewers) %>%
spread(key=days, value=n, sep="_")
## # A tibble: 8 x 3
## series days_7 days_28
## <fct> <dbl> <dbl>
## 1 1 16.6 NA
## 2 2 31.6 NA
## 3 3 50.0 NA
## 4 4 73.5 NA
## 5 5 100. NA
## 6 6 123. 113
## 7 7 136. 138.
## 8 8 90.2 92.9
# Fill in blanks to get premiere/finale data
tidy_ratings <- ratings %>%
gather(episode, viewers, -series, na.rm = TRUE) %>%
mutate(episode = parse_number(episode)) %>%
group_by(series) %>%
filter(episode == 1 | episode == max(episode)) %>%
ungroup()
# Recode first/last episodes
first_last <- tidy_ratings %>%
mutate(episode = recode(episode, `1` = "first", .default = "last"))
# Fill in to make slope chart
ggplot(first_last, aes(x = episode, y = viewers, color = as.factor(series))) +
geom_point() +
geom_line(aes(group = series))
# Switch the variables mapping x-axis and color
ggplot(first_last, aes(x = series, y = viewers, color = episode)) +
geom_point() + # keep
geom_line(aes(group = series)) + # keep
coord_flip() # keep
# Calculate relative increase in viewers
bump_by_series <- first_last %>%
spread(episode, viewers) %>%
mutate(bump = (last - first) / first)
# Fill in to make bar chart of bumps by series
ggplot(bump_by_series, aes(x = series, y = bump)) +
geom_col() +
scale_y_continuous(labels = scales::percent) # converts to %
Chapter 4 - Transform Your Data
Complex recoding with case_when:
Factors:
Dates:
Strings:
Final thoughts:
Example code includes:
baker_results <- readr::read_csv("./RInputFiles/baker_results.csv")
## Parsed with column specification:
## cols(
## .default = col_integer(),
## baker_full = col_character(),
## baker = col_character(),
## occupation = col_character(),
## hometown = col_character(),
## baker_last = col_character(),
## baker_first = col_character(),
## technical_median = col_double(),
## first_date_appeared = col_date(format = ""),
## last_date_appeared = col_date(format = ""),
## first_date_us = col_date(format = ""),
## last_date_us = col_date(format = ""),
## percent_episodes_appeared = col_double(),
## percent_technical_top3 = col_double()
## )
## See spec(...) for full column specifications.
messy_baker_results <- readr::read_csv("./RInputFiles/messy_baker_results.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## series = col_integer(),
## star_baker = col_integer(),
## technical_winner = col_integer(),
## technical_top3 = col_integer(),
## technical_bottom = col_integer(),
## technical_highest = col_integer(),
## technical_lowest = col_integer(),
## technical_median = col_double(),
## series_winner = col_integer(),
## series_runner_up = col_integer(),
## total_episodes_appeared = col_integer(),
## percent_episodes_appeared = col_double(),
## percent_technical_top3 = col_double(),
## first_date_appeared_uk = col_date(format = ""),
## last_date_appeared_uk = col_date(format = ""),
## first_date_us = col_date(format = ""),
## last_date_us = col_date(format = ""),
## e_1_technical = col_integer(),
## e_10_technical = col_integer(),
## e_2_technical = col_integer()
## # ... with 7 more columns
## )
## See spec(...) for full column specifications.
bakers <- baker_results
glimpse(bakers)
## Observations: 95
## Variables: 24
## $ series <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, ...
## $ baker_full <chr> "Annetha Mills", "David Chambers", "...
## $ baker <chr> "Annetha", "David", "Edd", "Jasminde...
## $ age <int> 30, 31, 24, 45, 25, 51, 44, 48, 37, ...
## $ occupation <chr> "Single mother", "Entrepreneur", "De...
## $ hometown <chr> "Essex", "Milton Keynes", "Bradford"...
## $ baker_last <chr> "Mills", "Chambers", "Kimber", "Rand...
## $ baker_first <chr> "Annetha", "David", "Edward", "Jasmi...
## $ star_baker <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, ...
## $ technical_winner <int> 0, 0, 2, 0, 1, 0, 0, 0, 2, 0, 1, 2, ...
## $ technical_top3 <int> 1, 1, 4, 2, 1, 0, 0, 0, 4, 2, 3, 5, ...
## $ technical_bottom <int> 1, 3, 1, 2, 2, 1, 1, 0, 1, 2, 1, 3, ...
## $ technical_highest <int> 2, 3, 1, 2, 1, 10, 4, NA, 1, 2, 1, 1...
## $ technical_lowest <int> 7, 8, 6, 5, 9, 10, 4, NA, 8, 5, 5, 6...
## $ technical_median <dbl> 4.5, 4.5, 2.0, 3.0, 6.0, 10.0, 4.0, ...
## $ series_winner <int> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ series_runner_up <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, ...
## $ total_episodes_appeared <int> 2, 4, 6, 5, 3, 1, 2, 1, 6, 6, 4, 8, ...
## $ first_date_appeared <date> 2010-08-17, 2010-08-17, 2010-08-17,...
## $ last_date_appeared <date> 2010-08-24, 2010-09-07, 2010-09-21,...
## $ first_date_us <date> NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ last_date_us <date> NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ percent_episodes_appeared <dbl> 33.33333, 66.66667, 100.00000, 83.33...
## $ percent_technical_top3 <dbl> 50.00000, 25.00000, 66.66667, 40.000...
# Create skill variable with 3 levels
# case_when() evaluates its clauses top-down; the final TRUE clause is the
# catch-all, so equal counts — including bakers with zero star_baker AND zero
# technical_winner wins — are labelled "well_rounded" (confirmed by the
# filter/count check below, which finds 41 such bakers)
bakers <- bakers %>%
mutate(skill = case_when(
star_baker > technical_winner ~ "super_star",
star_baker < technical_winner ~ "high_tech",
TRUE ~ "well_rounded"
))
# Filter zeroes to examine skill variable
bakers %>%
filter(star_baker==0 & technical_winner==0) %>%
count(skill)
## # A tibble: 1 x 2
## skill n
## <chr> <int>
## 1 well_rounded 41
# Add pipe to drop skill = NA
bakers_skill <- bakers %>%
mutate(skill = case_when(
star_baker > technical_winner ~ "super_star",
star_baker < technical_winner ~ "high_tech",
star_baker == 0 & technical_winner == 0 ~ NA_character_,
star_baker == technical_winner ~ "well_rounded"
)) %>%
drop_na(skill)
# Count bakers by skill
bakers_skill %>%
count(skill)
## # A tibble: 3 x 2
## skill n
## <chr> <int>
## 1 high_tech 24
## 2 super_star 15
## 3 well_rounded 15
# Cast skill as a factor
bakers <- bakers %>%
mutate(skill = as.factor(skill))
# Examine levels
bakers %>%
pull(skill) %>%
levels()
## [1] "high_tech" "super_star" "well_rounded"
baker_dates <- bakers %>%
select(series, baker, contains("date")) %>%
mutate(last_date_appeared_us=as.character(last_date_us),
first_date_appeared_us=as.character(first_date_us)
) %>%
rename(first_date_appeared_uk=first_date_appeared, last_date_appeared_uk=last_date_appeared) %>%
select(-last_date_us, -first_date_us)
glimpse(baker_dates)
## Observations: 95
## Variables: 6
## $ series <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, ...
## $ baker <chr> "Annetha", "David", "Edd", "Jasminder",...
## $ first_date_appeared_uk <date> 2010-08-17, 2010-08-17, 2010-08-17, 20...
## $ last_date_appeared_uk <date> 2010-08-24, 2010-09-07, 2010-09-21, 20...
## $ last_date_appeared_us <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ first_date_appeared_us <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
# Add a line to extract labeled month
baker_dates <- baker_dates %>%
mutate(last_date_appeared_us=lubridate::ymd(last_date_appeared_us),
last_month_us=lubridate::month(last_date_appeared_us, label=TRUE)
)
ggplot(baker_dates, aes(x=last_month_us)) + geom_bar()
baker_time <- baker_dates %>%
mutate(first_date_appeared_us=lubridate::ymd(first_date_appeared_us)) %>%
select(-last_month_us)
glimpse(baker_time)
## Observations: 95
## Variables: 6
## $ series <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, ...
## $ baker <chr> "Annetha", "David", "Edd", "Jasminder",...
## $ first_date_appeared_uk <date> 2010-08-17, 2010-08-17, 2010-08-17, 20...
## $ last_date_appeared_uk <date> 2010-08-24, 2010-09-07, 2010-09-21, 20...
## $ last_date_appeared_us <date> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ first_date_appeared_us <date> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
# Add a line to create whole months on air variable
# interval() spans first to last UK appearance; dividing by lubridate::weeks(1)
# yields a (possibly fractional) week count, while integer division (%/%) by
# months(1) counts only whole elapsed calendar months
baker_time <- baker_time %>%
mutate(time_on_air = lubridate::interval(first_date_appeared_uk, last_date_appeared_uk),
weeks_on_air = time_on_air / lubridate::weeks(1),
months_on_air = time_on_air %/% months(1)
)
# Count rows
messy_baker_results %>%
count(position_reached)
## # A tibble: 8 x 2
## position_reached n
## <chr> <int>
## 1 Runner-Up 1
## 2 Runner up 2
## 3 Runner Up 12
## 4 Third Place 1
## 5 winner 2
## 6 Winner 1
## 7 WINNER 5
## 8 <NA> 71
# Add another mutate to replace "THIRD PLACE" with "RUNNER UP" and count
# Normalize case first, then separators, then collapse the rare "THIRD PLACE"
# label into "RUNNER UP" (order matters: str_to_upper must run before the
# literal "THIRD PLACE" match)
messy_baker_results <- messy_baker_results %>%
mutate(position_reached = str_to_upper(position_reached),
position_reached = str_replace(position_reached, "-", " "),
position_reached = str_replace(position_reached, "THIRD PLACE", "RUNNER UP"))
# Count rows
messy_baker_results %>%
count(position_reached)
## # A tibble: 3 x 2
## position_reached n
## <chr> <int>
## 1 RUNNER UP 16
## 2 WINNER 8
## 3 <NA> 71
# Add a line to create new variable called student
bakers <- bakers %>%
mutate(occupation = str_to_lower(occupation),
student=str_detect(occupation, "student")
)
# Find all students and examine occupations
bakers %>%
filter(student) %>%
select(baker, occupation, student)
## # A tibble: 8 x 3
## baker occupation student
## <chr> <chr> <lgl>
## 1 Jason civil engineering student TRUE
## 2 James medical student TRUE
## 3 John law student TRUE
## 4 Ruby history of art and philosophy student TRUE
## 5 Martha student TRUE
## 6 Michael student TRUE
## 7 Rav student support TRUE
## 8 Liam student TRUE
Chapter 1 - Introduction to Modeling
Background on modeling for explanation:
Background on modeling for prediction:
Modeling problem for explanation:
Modeling problem for prediction:
Example code includes:
data(evals, package="moderndive")
glimpse(evals)
## Observations: 463
## Variables: 13
## $ ID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15...
## $ score <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5,...
## $ age <int> 36, 36, 36, 36, 59, 59, 59, 51, 51, 40, 40, 40, 4...
## $ bty_avg <dbl> 5.000, 5.000, 5.000, 5.000, 3.000, 3.000, 3.000, ...
## $ gender <fct> female, female, female, female, male, male, male,...
## $ ethnicity <fct> minority, minority, minority, minority, not minor...
## $ language <fct> english, english, english, english, english, engl...
## $ rank <fct> tenure track, tenure track, tenure track, tenure ...
## $ pic_outfit <fct> not formal, not formal, not formal, not formal, n...
## $ pic_color <fct> color, color, color, color, color, color, color, ...
## $ cls_did_eval <int> 24, 86, 76, 77, 17, 35, 39, 55, 111, 40, 24, 24, ...
## $ cls_students <int> 43, 125, 125, 123, 20, 40, 44, 55, 195, 46, 27, 2...
## $ cls_level <fct> upper, upper, upper, upper, upper, upper, upper, ...
# Plot the histogram
ggplot(evals, aes(x = age)) +
geom_histogram(binwidth = 5) +
labs(x = "age", y = "count")
# Compute summary stats
evals %>%
summarize(mean_age = mean(age),
median_age = median(age),
sd_age = sd(age))
## # A tibble: 1 x 3
## mean_age median_age sd_age
## <dbl> <int> <dbl>
## 1 48.4 48 9.80
data(house_prices, package="moderndive")
glimpse(house_prices)
## Observations: 21,613
## Variables: 21
## $ id <chr> "7129300520", "6414100192", "5631500400", "24872...
## $ date <dttm> 2014-10-13, 2014-12-09, 2015-02-25, 2014-12-09,...
## $ price <dbl> 221900, 538000, 180000, 604000, 510000, 1225000,...
## $ bedrooms <int> 3, 3, 2, 4, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 5, 4, ...
## $ bathrooms <dbl> 1.00, 2.25, 1.00, 3.00, 2.00, 4.50, 2.25, 1.50, ...
## $ sqft_living <int> 1180, 2570, 770, 1960, 1680, 5420, 1715, 1060, 1...
## $ sqft_lot <int> 5650, 7242, 10000, 5000, 8080, 101930, 6819, 971...
## $ floors <dbl> 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0...
## $ waterfront <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
## $ view <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, ...
## $ condition <fct> 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, ...
## $ grade <fct> 7, 7, 6, 7, 8, 11, 7, 7, 7, 7, 8, 7, 7, 7, 7, 9,...
## $ sqft_above <int> 1180, 2170, 770, 1050, 1680, 3890, 1715, 1060, 1...
## $ sqft_basement <int> 0, 400, 0, 910, 0, 1530, 0, 0, 730, 0, 1700, 300...
## $ yr_built <int> 1955, 1951, 1933, 1965, 1987, 2001, 1995, 1963, ...
## $ yr_renovated <int> 0, 1991, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ zipcode <fct> 98178, 98125, 98028, 98136, 98074, 98053, 98003,...
## $ lat <dbl> 47.5112, 47.7210, 47.7379, 47.5208, 47.6168, 47....
## $ long <dbl> -122.257, -122.319, -122.233, -122.393, -122.045...
## $ sqft_living15 <int> 1340, 1690, 2720, 1360, 1800, 4760, 2238, 1650, ...
## $ sqft_lot15 <int> 5650, 7639, 8062, 5000, 7503, 101930, 6819, 9711...
# Plot the histogram
ggplot(house_prices, aes(x = sqft_living)) +
geom_histogram() +
labs(x="Size (sq.feet)", y="count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Add log10_sqft_living
house_prices_2 <- house_prices %>%
mutate(log10_sqft_living = log10(sqft_living))
# Plot the histogram
ggplot(house_prices_2, aes(x = log10_sqft_living)) +
geom_histogram() +
labs(x = "log10 size", y = "count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Plot the histogram
ggplot(evals, aes(x=bty_avg)) +
geom_histogram(binwidth=0.5) +
labs(x = "Beauty score", y = "count")
# Scatterplot
ggplot(evals, aes(x = bty_avg, y = score)) +
geom_point() +
labs(x = "beauty score", y = "teaching score")
# Jitter plot
ggplot(evals, aes(x = bty_avg, y = score)) +
geom_jitter() +
labs(x = "beauty score", y = "teaching score")
# Compute correlation
evals %>%
summarize(correlation = cor(score, bty_avg))
## # A tibble: 1 x 1
## correlation
## <dbl>
## 1 0.187
house_prices <- house_prices %>%
mutate(log10_price=log10(price))
# View the structure of log10_price and waterfront
house_prices %>%
select(log10_price, waterfront) %>%
glimpse()
## Observations: 21,613
## Variables: 2
## $ log10_price <dbl> 5.346157, 5.730782, 5.255273, 5.781037, 5.707570, ...
## $ waterfront <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, F...
# Plot
ggplot(house_prices, aes(x = waterfront, y = log10_price)) +
geom_boxplot() +
labs(x = "waterfront", y = "log10 price")
# Calculate stats
house_prices %>%
group_by(waterfront) %>%
summarize(mean_log10_price = mean(log10_price), n = n())
## # A tibble: 2 x 3
## waterfront mean_log10_price n
## <lgl> <dbl> <int>
## 1 FALSE 5.66 21450
## 2 TRUE 6.12 163
# Prediction of price for houses with view
10^(6.12)
## [1] 1318257
# Prediction of price for houses without view
10^(5.66)
## [1] 457088.2
Chapter 2 - Modeling with Regression
Explaining teaching score with age:
Predicting teaching score using age:
Explaining teaching score with gender:
Predicting teaching score with gender:
Example code includes:
# Plot
ggplot(evals, aes(x = bty_avg, y = score)) +
geom_point() +
labs(x = "beauty score", y = "score") +
geom_smooth(method = "lm", se = FALSE)
# Fit model
model_score_2 <- lm(score ~ bty_avg, data = evals)
# Output content
model_score_2
##
## Call:
## lm(formula = score ~ bty_avg, data = evals)
##
## Coefficients:
## (Intercept) bty_avg
## 3.88034 0.06664
# Output regression table
moderndive::get_regression_table(model_score_2)
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 3.88 0.076 51.0 0 3.73 4.03
## 2 bty_avg 0.067 0.016 4.09 0 0.035 0.099
# Use fitted intercept and slope to get a prediction
y_hat <- 3.88 + 0.067 * 5
y_hat
## [1] 4.215
# Compute residual y - y_hat
4.7 - y_hat
## [1] 0.485
# Get regression table
moderndive::get_regression_table(model_score_2, digits = 5)
## # A tibble: 2 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 3.88 0.0761 51.0 0 3.73 4.03
## 2 bty_avg 0.0666 0.0163 4.09 0.00005 0.0346 0.0986
# Get all fitted/predicted values and residuals
moderndive::get_regression_points(model_score_2)
## # A tibble: 463 x 5
## ID score bty_avg score_hat residual
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 4.7 5 4.21 0.486
## 2 2 4.1 5 4.21 -0.114
## 3 3 3.9 5 4.21 -0.314
## 4 4 4.8 5 4.21 0.586
## 5 5 4.6 3 4.08 0.52
## 6 6 4.3 3 4.08 0.22
## 7 7 2.8 3 4.08 -1.28
## 8 8 4.1 3.33 4.10 -0.002
## 9 9 3.4 3.33 4.10 -0.702
## 10 10 4.5 3.17 4.09 0.409
## # ... with 453 more rows
# Get all fitted/predicted values and residuals
moderndive::get_regression_points(model_score_2) %>%
mutate(score_hat_2 = 3.88 + 0.0666 * bty_avg)
## # A tibble: 463 x 6
## ID score bty_avg score_hat residual score_hat_2
## <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 4.7 5 4.21 0.486 4.21
## 2 2 4.1 5 4.21 -0.114 4.21
## 3 3 3.9 5 4.21 -0.314 4.21
## 4 4 4.8 5 4.21 0.586 4.21
## 5 5 4.6 3 4.08 0.52 4.08
## 6 6 4.3 3 4.08 0.22 4.08
## 7 7 2.8 3 4.08 -1.28 4.08
## 8 8 4.1 3.33 4.10 -0.002 4.10
## 9 9 3.4 3.33 4.10 -0.702 4.10
## 10 10 4.5 3.17 4.09 0.409 4.09
## # ... with 453 more rows
# Get all fitted/predicted values and residuals
moderndive::get_regression_points(model_score_2) %>%
mutate(residual_2 = score - score_hat)
## # A tibble: 463 x 6
## ID score bty_avg score_hat residual residual_2
## <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 4.7 5 4.21 0.486 0.486
## 2 2 4.1 5 4.21 -0.114 -0.114
## 3 3 3.9 5 4.21 -0.314 -0.314
## 4 4 4.8 5 4.21 0.586 0.586
## 5 5 4.6 3 4.08 0.52 0.520
## 6 6 4.3 3 4.08 0.22 0.220
## 7 7 2.8 3 4.08 -1.28 -1.28
## 8 8 4.1 3.33 4.10 -0.002 -0.002
## 9 9 3.4 3.33 4.10 -0.702 -0.702
## 10 10 4.5 3.17 4.09 0.409 0.409
## # ... with 453 more rows
ggplot(evals, aes(x=rank, y=score)) +
geom_boxplot() +
labs(x = "rank", y = "score")
evals %>%
group_by(rank) %>%
summarize(n = n(), mean_score = mean(score), sd_score = sd(score))
## # A tibble: 3 x 4
## rank n mean_score sd_score
## <fct> <int> <dbl> <dbl>
## 1 teaching 102 4.28 0.498
## 2 tenure track 108 4.15 0.561
## 3 tenured 253 4.14 0.550
# Fit regression model
model_score_4 <- lm(score ~ rank, data = evals)
# Get regression table
moderndive::get_regression_table(model_score_4, digits = 5)
## # A tibble: 3 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 4.28 0.0536 79.9 0 4.18 4.39
## 2 ranktenure track -0.130 0.0748 -1.73 0.0837 -0.277 0.0173
## 3 ranktenured -0.145 0.0636 -2.28 0.0228 -0.270 -0.0203
# teaching mean
teaching_mean <- 4.28
# tenure track mean
tenure_track_mean <- 4.28-0.13
# tenure mean
tenure_mean <- 4.28-0.145
# Calculate predictions and residuals
model_score_4_points <- moderndive::get_regression_points(model_score_4)
model_score_4_points
## # A tibble: 463 x 5
## ID score rank score_hat residual
## <int> <dbl> <fct> <dbl> <dbl>
## 1 1 4.7 tenure track 4.16 0.545
## 2 2 4.1 tenure track 4.16 -0.055
## 3 3 3.9 tenure track 4.16 -0.255
## 4 4 4.8 tenure track 4.16 0.645
## 5 5 4.6 tenured 4.14 0.461
## 6 6 4.3 tenured 4.14 0.161
## 7 7 2.8 tenured 4.14 -1.34
## 8 8 4.1 tenured 4.14 -0.039
## 9 9 3.4 tenured 4.14 -0.739
## 10 10 4.5 tenured 4.14 0.361
## # ... with 453 more rows
# Plot residuals
ggplot(model_score_4_points, aes(x=residual)) +
geom_histogram() +
labs(x = "residuals", title = "Residuals from score ~ rank model")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Chapter 3 - Modeling with Multiple Regression
Explaining house price with year and size:
Predicting house price using year and size:
Explaining house price with size and condition:
Predicting house price using size and condition:
Example code includes:
# Create scatterplot with regression line
ggplot(house_prices, aes(x=bedrooms, y = log10_price)) +
geom_point() +
labs(x = "Number of bedrooms", y = "log10 price") +
geom_smooth(method = "lm", se = FALSE)
# Remove outlier
house_prices_transform <- house_prices %>%
filter(bedrooms < 33) %>%
mutate(log10_sqft_living=log10(sqft_living))
# Create scatterplot with regression line
ggplot(house_prices_transform, aes(x = bedrooms, y = log10_price)) +
geom_point() +
labs(x = "Number of bedrooms", y = "log10 price") +
geom_smooth(method = "lm", se = FALSE)
# Fit model
model_price_2 <- lm(log10_price ~ log10_sqft_living + bedrooms, data = house_prices_transform)
# Get regression table
moderndive::get_regression_table(model_price_2)
## # A tibble: 3 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 2.69 0.023 116. 0 2.65 2.74
## 2 log10_sqft_living 0.941 0.008 118. 0 0.925 0.957
## 3 bedrooms -0.033 0.002 -20.5 0 -0.036 -0.03
# Make prediction in log10 dollars
2.69 + 0.941 * log10(1000) - 0.033 * 3
## [1] 5.414
# Make prediction dollars
# Back-transform the log10 prediction to dollars; `^` is the idiomatic
# exponent operator (`**` is a parser synonym with identical behavior)
10^(2.69 + 0.941 * log10(1000) - 0.033 * 3)
## [1] 259417.9
# Automate prediction and residual computation
# Sum of squared residuals in a single summarize step; the resulting
# one-row, one-column tibble is identical to mutate-then-summarize
moderndive::get_regression_points(model_price_2) %>%
  summarize(sum_squared_residuals = sum(residual^2))
## # A tibble: 1 x 1
## sum_squared_residuals
## <dbl>
## 1 604.
# Fit model
# One numeric predictor (log10 living area) plus one logical predictor
# (waterfront); lm encodes the latter as a waterfrontTRUE dummy
model_price_4 <- lm(log10_price ~ log10_sqft_living + waterfront, data = house_prices_transform)
# Get regression table
moderndive::get_regression_table(model_price_4)
## # A tibble: 3 x 7
## term estimate std_error statistic p_value lower_ci upper_ci
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 intercept 2.96 0.02 146. 0 2.92 3.00
## 2 log10_sqft_living 0.825 0.006 134. 0 0.813 0.837
## 3 waterfrontTRUE 0.322 0.013 24.5 0 0.296 0.348
# Prediction for House A
# Waterfront house (log10 size 2.9): add the waterfrontTRUE offset 0.322,
# then back-transform to dollars
10^(2.96 + 0.825 * 2.9 + 0.322)
## [1] 472606.8
# Prediction for House B
# Non-waterfront house (log10 size 3.1): the waterfront term contributes 0
10^(2.96 + 0.825 * 3.1 + 0)
## [1] 329230.5
# View the "new" houses
# Two hypothetical houses for out-of-sample prediction:
# row 1 = House A (log10 size 2.9, waterfront), row 2 = House B (3.1, not)
new_houses_2 <- tibble(log10_sqft_living=c(2.9, 3.1), waterfront=c(TRUE, FALSE))
new_houses_2
## # A tibble: 2 x 2
## log10_sqft_living waterfront
## <dbl> <lgl>
## 1 2.9 TRUE
## 2 3.1 FALSE
# Get predictions price_hat in dollars on "new" houses
# newdata= scores rows not used in fitting; back-transform the log10
# predictions to dollar amounts
moderndive::get_regression_points(model_price_4, newdata = new_houses_2) %>%
mutate(price_hat = 10**log10_price_hat)
## # A tibble: 2 x 5
## ID log10_sqft_living waterfront log10_price_hat price_hat
## <int> <dbl> <lgl> <dbl> <dbl>
## 1 1 2.9 TRUE 5.67 472063.
## 2 2 3.1 FALSE 5.52 328095.
Chapter 4 - Model Selection and Assessment
Model selection and assessment:
Assessing model fit with R-squared:
Assessing predictions with RMSE:
Validation set prediction framework:
Next steps:
Example code includes:
# Model 2
model_price_2 <- lm(log10_price ~ log10_sqft_living + bedrooms, data = house_prices_transform)
# Calculate squared residuals
# Single-step sum of squared residuals; output tibble identical to the
# mutate-then-summarize form
moderndive::get_regression_points(model_price_2) %>%
  summarize(sum_sq_residuals = sum(residual^2))
## # A tibble: 1 x 1
## sum_sq_residuals
## <dbl>
## 1 604.
# Model 4
model_price_4 <- lm(log10_price ~ log10_sqft_living + waterfront, data = house_prices_transform)
# Calculate squared residuals
# Slightly lower SSR than model 2 (599 vs 604): waterfront is the more
# useful second predictor
moderndive::get_regression_points(model_price_4) %>%
  summarize(sum_sq_residuals = sum(residual^2))
## # A tibble: 1 x 1
## sum_sq_residuals
## <dbl>
## 1 599.
# Get fitted values & residuals, compute R^2 using residuals
# R^2 = 1 - var(residuals) / var(outcome): share of outcome variance
# explained by the model
moderndive::get_regression_points(model_price_2) %>%
summarize(r_squared = 1 - var(residual) / var(log10_price))
## # A tibble: 1 x 1
## r_squared
## <dbl>
## 1 0.466
# Get fitted values & residuals, compute R^2 using residuals
# Model 4 explains slightly more variance than model 2 (0.470 vs 0.466)
moderndive::get_regression_points(model_price_4) %>%
summarize(r_squared = 1 - var(residual) / var(log10_price))
## # A tibble: 1 x 1
## r_squared
## <dbl>
## 1 0.470
# Get all residuals, square them, take the mean and square root
# MSE computed directly inside summarize; RMSE derived from it afterwards.
# Result is the same two-column (mse, rmse) tibble as the original form.
moderndive::get_regression_points(model_price_2) %>%
  summarize(mse = mean(residual^2)) %>%
  mutate(rmse = sqrt(mse))
## # A tibble: 1 x 2
## mse rmse
## <dbl> <dbl>
## 1 0.0279 0.167
# MSE and RMSE for model_price_2
# RMSE is on the same scale as the outcome (log10 dollars), which makes it
# easier to interpret than MSE
moderndive::get_regression_points(model_price_2) %>%
mutate(sq_residuals = residual^2) %>%
summarize(mse = mean(sq_residuals), rmse = sqrt(mean(sq_residuals)))
## # A tibble: 1 x 2
## mse rmse
## <dbl> <dbl>
## 1 0.0279 0.167
# MSE and RMSE for model_price_4
# Marginally smaller RMSE than model 2 (0.166 vs 0.167), consistent with
# its lower sum of squared residuals
moderndive::get_regression_points(model_price_4) %>%
mutate(sq_residuals = residual^2) %>%
summarize(mse = mean(sq_residuals), rmse = sqrt(mean(sq_residuals)))
## # A tibble: 1 x 2
## mse rmse
## <dbl> <dbl>
## 1 0.0277 0.166
# Set random number generator seed value for reproducibility
set.seed(76)
# Shuffle the rows once (sample_frac with size 1 is a full permutation),
# then carve off the first 10,000 rows for training and the rest for testing
house_prices_shuffled <- sample_frac(house_prices_transform, size = 1, replace = FALSE)
train <- slice(house_prices_shuffled, 1:10000)
test <- slice(house_prices_shuffled, 10001:nrow(house_prices_shuffled))
# Fit model to training set
# Only the 10,000 training rows are used for estimation
train_model_2 <- lm(log10_price ~ log10_sqft_living + bedrooms, data=train)
# Compute RMSE (train)
moderndive::get_regression_points(train_model_2) %>%
mutate(sq_residuals = residual**2) %>%
summarize(rmse = sqrt(mean(sq_residuals)))
## # A tibble: 1 x 1
## rmse
## <dbl>
## 1 0.167
# Compute RMSE (test)
# Scoring held-out rows via newdata=; test RMSE matching train RMSE (0.167)
# indicates the model is not overfitting
moderndive::get_regression_points(train_model_2, newdata = test) %>%
mutate(sq_residuals = residual**2) %>%
summarize(rmse = sqrt(mean(sq_residuals)))
## # A tibble: 1 x 1
## rmse
## <dbl>
## 1 0.167
Chapter 1 - Introduction to Survey Data
What are survey weights?
Specifying elements of the design in R:
Visualizing impact of survey weights:
Example code includes:
colTypes <- "FINLWT21 numeric _ FINCBTAX integer _ BLS_URBN integer _ POPSIZE integer _ EDUC_REF character _ EDUCA2 character _ AGE_REF integer _ AGE2 character _ SEX_REF integer _ SEX2 integer _ REF_RACE integer _ RACE2 integer _ HISP_REF integer _ HISP2 integer _ FAM_TYPE integer _ MARITAL1 integer _ REGION integer _ SMSASTAT integer _ HIGH_EDU character _ EHOUSNGC numeric _ TOTEXPCQ numeric _ FOODCQ numeric _ TRANSCQ numeric _ HEALTHCQ numeric _ ENTERTCQ numeric _ EDUCACQ integer _ TOBACCCQ numeric _ STUDFINX character _ IRAX character _ CUTENURE integer _ FAM_SIZE integer _ VEHQ integer _ ROOMSQ character _ INC_HRS1 character _ INC_HRS2 character _ EARNCOMP integer _ NO_EARNR integer _ OCCUCOD1 character _ OCCUCOD2 character _ STATE character _ DIVISION integer _ TOTXEST integer _ CREDFINX character _ CREDITB integer _ CREDITX character _ BUILDING character _ ST_HOUS integer _ INT_PHON character _ INT_HOME character _ "
# Consumer Expenditure survey extract; readr infers column types (spec
# echoed below) and they are cross-checked against colTypes further down
ce <- readr::read_csv("./RInputFiles/ce.csv")
## Parsed with column specification:
## cols(
## .default = col_integer(),
## FINLWT21 = col_double(),
## EDUC_REF = col_character(),
## EDUCA2 = col_character(),
## AGE2 = col_character(),
## HIGH_EDU = col_character(),
## EHOUSNGC = col_double(),
## TOTEXPCQ = col_double(),
## FOODCQ = col_double(),
## TRANSCQ = col_double(),
## HEALTHCQ = col_double(),
## ENTERTCQ = col_double(),
## TOBACCCQ = col_double(),
## STUDFINX = col_character(),
## IRAX = col_character(),
## ROOMSQ = col_character(),
## INC_HRS1 = col_character(),
## INC_HRS2 = col_character(),
## OCCUCOD1 = col_character(),
## OCCUCOD2 = col_character(),
## STATE = col_character()
## # ... with 5 more columns
## )
## See spec(...) for full column specifications.
# Inspect structure of the CE data: 6,301 households x 49 variables
glimpse(ce)
## Observations: 6,301
## Variables: 49
## $ FINLWT21 <dbl> 25984.767, 6581.018, 20208.499, 18078.372, 20111.619,...
## $ FINCBTAX <int> 116920, 200, 117000, 0, 2000, 942, 0, 91000, 95000, 4...
## $ BLS_URBN <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ POPSIZE <int> 2, 3, 4, 2, 2, 2, 1, 2, 5, 2, 3, 2, 2, 3, 4, 3, 3, 1,...
## $ EDUC_REF <chr> "16", "15", "16", "15", "14", "11", "10", "13", "12",...
## $ EDUCA2 <chr> "15", "15", "13", NA, NA, NA, NA, "15", "15", "14", "...
## $ AGE_REF <int> 63, 50, 47, 37, 51, 63, 77, 37, 51, 64, 26, 59, 81, 5...
## $ AGE2 <chr> "50", "47", "46", ".", ".", ".", ".", "36", "53", "67...
## $ SEX_REF <int> 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,...
## $ SEX2 <int> 2, 2, 1, NA, NA, NA, NA, 2, 2, 1, 1, 1, NA, NA, NA, 1...
## $ REF_RACE <int> 1, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1,...
## $ RACE2 <int> 1, 4, 1, NA, NA, NA, NA, 1, 1, 1, 1, 1, NA, NA, NA, 2...
## $ HISP_REF <int> 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,...
## $ HISP2 <int> 2, 2, 1, NA, NA, NA, NA, 2, 2, 2, 2, 2, NA, NA, NA, 2...
## $ FAM_TYPE <int> 3, 4, 1, 8, 9, 9, 8, 3, 1, 1, 3, 1, 8, 9, 8, 5, 9, 4,...
## $ MARITAL1 <int> 1, 1, 1, 5, 3, 3, 2, 1, 1, 1, 1, 1, 2, 3, 5, 1, 3, 1,...
## $ REGION <int> 4, 4, 3, 4, 4, 3, 4, 1, 3, 2, 1, 4, 1, 3, 3, 3, 2, 1,...
## $ SMSASTAT <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ HIGH_EDU <chr> "16", "15", "16", "15", "14", "11", "10", "15", "15",...
## $ EHOUSNGC <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ TOTEXPCQ <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FOODCQ <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ TRANSCQ <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ HEALTHCQ <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ENTERTCQ <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ EDUCACQ <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ TOBACCCQ <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ STUDFINX <chr> ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", "."...
## $ IRAX <chr> "1000000", "10000", "0", ".", ".", "0", "0", "15000",...
## $ CUTENURE <int> 1, 1, 1, 1, 1, 2, 4, 1, 1, 2, 1, 2, 2, 2, 2, 4, 1, 1,...
## $ FAM_SIZE <int> 4, 6, 2, 1, 2, 2, 1, 5, 2, 2, 4, 2, 1, 2, 1, 4, 2, 4,...
## $ VEHQ <int> 3, 5, 0, 4, 2, 0, 0, 2, 4, 2, 3, 2, 1, 3, 1, 2, 4, 4,...
## $ ROOMSQ <chr> "8", "5", "6", "4", "4", "4", "7", "5", "4", "9", "6"...
## $ INC_HRS1 <chr> "40", "40", "40", "44", "40", ".", ".", "40", "40", "...
## $ INC_HRS2 <chr> "30", "40", "52", ".", ".", ".", ".", "40", "40", "."...
## $ EARNCOMP <int> 3, 2, 2, 1, 4, 7, 8, 2, 2, 8, 2, 8, 8, 7, 8, 2, 7, 3,...
## $ NO_EARNR <int> 4, 2, 2, 1, 2, 1, 0, 2, 2, 0, 2, 0, 0, 1, 0, 2, 1, 3,...
## $ OCCUCOD1 <chr> "03", "03", "05", "03", "04", NA, NA, "12", "04", NA,...
## $ OCCUCOD2 <chr> "04", "02", "01", NA, NA, NA, NA, "02", "03", NA, "11...
## $ STATE <chr> "41", "15", "48", "06", "06", "48", "06", "42", NA, "...
## $ DIVISION <int> 9, 9, 7, 9, 9, 7, 9, 2, NA, 4, 1, 8, 2, 5, 6, 7, 3, 2...
## $ TOTXEST <int> 15452, 11459, 15738, 25978, 588, 0, 0, 7261, 9406, -1...
## $ CREDFINX <chr> "0", ".", "0", ".", "5", ".", ".", ".", ".", "0", "."...
## $ CREDITB <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ CREDITX <chr> "4000", "5000", "2000", ".", "7000", "1800", ".", "60...
## $ BUILDING <chr> "01", "01", "01", "02", "08", "01", "01", "01", "01",...
## $ ST_HOUS <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,...
## $ INT_PHON <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ INT_HOME <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
# Rebuild the "NAME class _ " type signature from the parsed data and
# confirm it matches the expected colTypes string recorded above.
# vapply + paste0(collapse = "") replaces the original grow-a-string loop:
# it avoids O(n^2) accumulation and, because class() can return a vector
# (e.g. for multi-class columns), taking class(col)[1] guards against the
# accumulator silently recycling into a character vector.
ceColTypes <- paste0(
  names(ce), " ",
  vapply(ce, function(col) class(col)[1], character(1)), " _ ",
  collapse = ""
)
all.equal(colTypes, ceColTypes)
## [1] TRUE
# Construct a histogram of the weights
# FINLWT21 is the CE survey weight column; its spread shows how unequally
# sampled units represent the population
ggplot(ce, aes(FINLWT21)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# In the next few exercises we will practice specifying sampling designs using different samples from the api dataset, located in the survey package
# The api dataset contains the Academic Performance Index and demographic information for schools in California
# The apisrs dataset is a simple random sample of schools from the api dataset
# Notice that pw contains the survey weights and fpc contains the total number of schools in the population
# data() with package= loads the datasets without attaching the package,
# so this call can precede library(survey)
data(api, package="survey")
library(survey)
## Loading required package: grid
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
# Look at the apisrs dataset
# 200 schools; pw (weight) and fpc are constant, as expected under SRS
glimpse(apisrs)
## Observations: 200
## Variables: 39
## $ cds <chr> "15739081534155", "19642126066716", "30664493030640",...
## $ stype <fct> H, E, H, E, E, E, M, E, E, E, E, H, M, E, E, E, M, M,...
## $ name <chr> "McFarland High", "Stowers (Cecil ", "Brea-Olinda Hig...
## $ sname <chr> "McFarland High", "Stowers (Cecil B.) Elementary", "B...
## $ snum <dbl> 1039, 1124, 2868, 1273, 4926, 2463, 2031, 1736, 2142,...
## $ dname <chr> "McFarland Unified", "ABC Unified", "Brea-Olinda Unif...
## $ dnum <int> 432, 1, 79, 187, 640, 284, 401, 401, 470, 632, 401, 7...
## $ cname <chr> "Kern", "Los Angeles", "Orange", "Los Angeles", "San ...
## $ cnum <int> 14, 18, 29, 18, 39, 18, 18, 18, 18, 37, 18, 24, 14, 1...
## $ flag <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ pcttest <int> 98, 100, 98, 99, 99, 93, 98, 99, 100, 90, 95, 100, 97...
## $ api00 <int> 462, 878, 734, 772, 739, 835, 456, 506, 543, 649, 556...
## $ api99 <int> 448, 831, 742, 657, 719, 822, 472, 474, 458, 604, 575...
## $ target <int> 18, NA, 3, 7, 4, NA, 16, 16, 17, 10, 11, 9, 14, 5, 15...
## $ growth <int> 14, 47, -8, 115, 20, 13, -16, 32, 85, 45, -19, 51, 4,...
## $ sch.wide <fct> No, Yes, No, Yes, Yes, Yes, No, Yes, Yes, Yes, No, Ye...
## $ comp.imp <fct> Yes, Yes, No, Yes, Yes, Yes, No, Yes, Yes, No, No, Ye...
## $ both <fct> No, Yes, No, Yes, Yes, Yes, No, Yes, Yes, No, No, Yes...
## $ awards <fct> No, Yes, No, Yes, Yes, No, No, Yes, Yes, No, No, Yes,...
## $ meals <int> 44, 8, 10, 70, 43, 16, 81, 98, 94, 85, 81, 67, 77, 20...
## $ ell <int> 31, 25, 10, 25, 12, 19, 40, 65, 65, 57, 4, 25, 32, 16...
## $ yr.rnd <fct> NA, NA, NA, NA, NA, NA, NA, No, NA, NA, NA, NA, NA, N...
## $ mobility <int> 6, 15, 7, 23, 12, 13, 22, 43, 15, 10, 20, 12, 4, 32, ...
## $ acs.k3 <int> NA, 19, NA, 23, 20, 19, NA, 18, 19, 16, 16, NA, NA, 1...
## $ acs.46 <int> NA, 30, NA, NA, 29, 29, 30, 29, 32, 25, 27, NA, NA, 2...
## $ acs.core <int> 24, NA, 28, NA, NA, NA, 27, NA, NA, 30, NA, 17, 27, N...
## $ pct.resp <int> 82, 97, 95, 100, 91, 71, 49, 75, 99, 49, 62, 96, 77, ...
## $ not.hsg <int> 44, 4, 5, 37, 8, 1, 30, 49, 48, 23, 5, 44, 40, 4, 14,...
## $ hsg <int> 34, 10, 9, 40, 21, 8, 27, 31, 34, 36, 38, 19, 34, 14,...
## $ some.col <int> 12, 23, 21, 14, 27, 20, 18, 15, 14, 14, 29, 17, 16, 2...
## $ col.grad <int> 7, 43, 41, 8, 34, 38, 22, 2, 4, 21, 24, 19, 8, 37, 10...
## $ grad.sch <int> 3, 21, 24, 1, 10, 34, 2, 3, 1, 6, 5, 2, 2, 19, 1, 3, ...
## $ avg.ed <dbl> 1.91, 3.66, 3.71, 1.96, 3.17, 3.96, 2.39, 1.79, 1.77,...
## $ full <int> 71, 90, 83, 85, 100, 75, 72, 69, 68, 81, 84, 100, 89,...
## $ emer <int> 35, 10, 18, 18, 0, 20, 25, 22, 29, 7, 16, 0, 11, 5, 6...
## $ enroll <int> 477, 478, 1410, 342, 217, 258, 1274, 566, 645, 311, 3...
## $ api.stu <int> 429, 420, 1287, 291, 189, 211, 1090, 353, 563, 258, 2...
## $ pw <dbl> 30.97, 30.97, 30.97, 30.97, 30.97, 30.97, 30.97, 30.9...
## $ fpc <dbl> 6194, 6194, 6194, 6194, 6194, 6194, 6194, 6194, 6194,...
# Specify a simple random sampling for apisrs
# id = ~1 declares no clustering (each school is its own sampling unit);
# fpc supplies the population size for the finite population correction
apisrs_design <- svydesign(data = apisrs, weights = ~pw, fpc = ~fpc, id = ~1)
# Print a summary of the design
# (the summary echoes the svydesign call and sampling probabilities)
summary(apisrs_design)
## Independent Sampling design
## svydesign(data = apisrs, weights = ~pw, fpc = ~fpc, id = ~1)
## Probabilities:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.03229 0.03229 0.03229 0.03229 0.03229 0.03229
## Population size (PSUs): 6194
## Data variables:
## [1] "cds" "stype" "name" "sname" "snum" "dname"
## [7] "dnum" "cname" "cnum" "flag" "pcttest" "api00"
## [13] "api99" "target" "growth" "sch.wide" "comp.imp" "both"
## [19] "awards" "meals" "ell" "yr.rnd" "mobility" "acs.k3"
## [25] "acs.46" "acs.core" "pct.resp" "not.hsg" "hsg" "some.col"
## [31] "col.grad" "grad.sch" "avg.ed" "full" "emer" "enroll"
## [37] "api.stu" "pw" "fpc"
# Now let's practice specifying a stratified sampling design, using the dataset apistrat
# The schools are stratified based on the school type stype where E = Elementary, M = Middle, and H = High School
# For each school type, a simple random sample of schools was taken
# Glimpse the data
# pw and fpc now vary by stratum rather than being constant
glimpse(apistrat)
## Observations: 200
## Variables: 39
## $ cds <chr> "19647336097927", "19647336016018", "19648816021505",...
## $ stype <fct> E, E, E, E, E, E, E, E, E, E, M, M, H, M, H, E, E, M,...
## $ name <chr> "Open Magnet: Ce", "Belvedere Eleme", "Altadena Eleme...
## $ sname <chr> "Open Magnet: Center for Individual (Char", "Belveder...
## $ snum <dbl> 2077, 1622, 2236, 1921, 6140, 6077, 6071, 904, 4637, ...
## $ dname <chr> "Los Angeles Unified", "Los Angeles Unified", "Pasade...
## $ dnum <int> 401, 401, 541, 401, 460, 689, 689, 41, 702, 135, 590,...
## $ cname <chr> "Los Angeles", "Los Angeles", "Los Angeles", "Los Ang...
## $ cnum <int> 18, 18, 18, 18, 55, 55, 55, 14, 36, 36, 35, 32, 9, 1,...
## $ flag <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ pcttest <int> 99, 100, 99, 100, 100, 100, 99, 98, 100, 100, 99, 99,...
## $ api00 <int> 840, 516, 531, 501, 720, 805, 778, 731, 592, 669, 496...
## $ api99 <int> 816, 476, 544, 457, 659, 780, 787, 731, 508, 658, 479...
## $ target <int> NA, 16, 13, 17, 7, 1, 1, 3, 15, 7, 16, 15, 17, 20, 13...
## $ growth <int> 24, 40, -13, 44, 61, 25, -9, 0, 84, 11, 17, 6, 7, 3, ...
## $ sch.wide <fct> Yes, Yes, No, Yes, Yes, Yes, No, No, Yes, Yes, Yes, N...
## $ comp.imp <fct> No, Yes, No, Yes, Yes, Yes, No, No, Yes, No, No, No, ...
## $ both <fct> No, Yes, No, Yes, Yes, Yes, No, No, Yes, No, No, No, ...
## $ awards <fct> No, Yes, No, Yes, Yes, Yes, No, No, Yes, No, No, No, ...
## $ meals <int> 33, 98, 64, 83, 26, 7, 9, 45, 75, 47, 69, 60, 66, 54,...
## $ ell <int> 25, 77, 23, 63, 17, 0, 2, 2, 58, 23, 25, 10, 43, 26, ...
## $ yr.rnd <fct> No, Yes, No, No, No, No, No, No, Yes, No, No, No, No,...
## $ mobility <int> 11, 26, 17, 13, 31, 12, 10, 15, 23, 19, 26, 22, 16, 4...
## $ acs.k3 <int> 20, 19, 20, 17, 20, 19, 19, 19, 20, 18, NA, NA, NA, N...
## $ acs.46 <int> 29, 28, 30, 30, 30, 29, 31, 31, 32, 29, 32, 32, NA, 3...
## $ acs.core <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 30, 32, 27, 2...
## $ pct.resp <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 87, 67, 50, 70, 71, ...
## $ not.hsg <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 31, 49, 12, 20, 45, ...
## $ hsg <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 34, 20, 33, 20, 36, ...
## $ some.col <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 22, 15, 23, 31, 11, ...
## $ col.grad <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 12, 29, 23, 8, 9...
## $ grad.sch <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 3, 6, 0, 0, 11...
## $ avg.ed <dbl> 3.32, 1.67, 2.34, 1.86, 3.17, 3.64, 3.55, 3.10, 2.17,...
## $ full <int> 100, 57, 81, 64, 90, 95, 96, 93, 91, 96, 84, 65, 93, ...
## $ emer <int> 0, 40, 26, 24, 7, 0, 0, 8, 14, 0, 18, 37, 17, 26, 19,...
## $ enroll <int> 276, 841, 441, 298, 354, 330, 385, 583, 763, 381, 129...
## $ api.stu <int> 241, 631, 415, 288, 319, 315, 363, 510, 652, 322, 103...
## $ pw <dbl> 44.21, 44.21, 44.21, 44.21, 44.21, 44.21, 44.21, 44.2...
## $ fpc <dbl> 4421, 4421, 4421, 4421, 4421, 4421, 4421, 4421, 4421,...
# Summarize strata sample sizes
# 100 elementary, 50 middle, and 50 high schools were sampled
count(apistrat, stype)
## # A tibble: 3 x 2
## stype n
## <fct> <int>
## 1 E 100
## 2 H 50
## 3 M 50
# Specify the design
# strata = ~stype tells svydesign that sampling was done independently
# within each school type; weights and fpc are per-stratum
strat_design <- svydesign(data = apistrat, weights = ~pw, fpc = ~fpc, id = ~1, strata = ~stype)
# Look at the summary information for the stratified design
summary(strat_design)
## Stratified Independent Sampling design
## svydesign(data = apistrat, weights = ~pw, fpc = ~fpc, id = ~1,
## strata = ~stype)
## Probabilities:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02262 0.02262 0.03587 0.04014 0.05339 0.06623
## Stratum Sizes:
## E H M
## obs 100 50 50
## design.PSU 100 50 50
## actual.PSU 100 50 50
## Population stratum sizes (PSUs):
## E H M
## 4421 755 1018
## Data variables:
## [1] "cds" "stype" "name" "sname" "snum" "dname"
## [7] "dnum" "cname" "cnum" "flag" "pcttest" "api00"
## [13] "api99" "target" "growth" "sch.wide" "comp.imp" "both"
## [19] "awards" "meals" "ell" "yr.rnd" "mobility" "acs.k3"
## [25] "acs.46" "acs.core" "pct.resp" "not.hsg" "hsg" "some.col"
## [31] "col.grad" "grad.sch" "avg.ed" "full" "emer" "enroll"
## [37] "api.stu" "pw" "fpc"
# Now let's practice specifying a cluster sampling design, using the dataset apiclus2
# The schools were clustered based on school districts, dnum
# Within a sampled school district, 5 schools were randomly selected for the sample
# The schools are denoted by snum
# The number of districts is given by fpc1 and the number of schools in the sampled districts is given by fpc2
# Glimpse the data
glimpse(apiclus2)
## Observations: 126
## Variables: 40
## $ cds <chr> "31667796031017", "55751846054837", "41688746043517",...
## $ stype <fct> E, E, E, M, E, E, E, E, M, H, E, M, E, E, E, E, H, E,...
## $ name <chr> "Alta-Dutch Flat", "Tenaya Elementa", "Panorama Eleme...
## $ sname <chr> "Alta-Dutch Flat Elementary", "Tenaya Elementary", "P...
## $ snum <dbl> 3269, 5979, 4958, 4957, 4956, 4915, 2548, 2550, 2549,...
## $ dname <chr> "Alta-Dutch Flat Elem", "Big Oak Flat-Grvlnd Unif", "...
## $ dnum <int> 15, 63, 83, 83, 83, 117, 132, 132, 132, 152, 152, 152...
## $ cname <chr> "Placer", "Tuolumne", "San Mateo", "San Mateo", "San ...
## $ cnum <int> 30, 54, 40, 40, 40, 39, 19, 19, 19, 5, 5, 5, 36, 36, ...
## $ flag <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ pcttest <int> 100, 100, 98, 100, 98, 100, 100, 100, 100, 96, 98, 10...
## $ api00 <int> 821, 773, 600, 740, 716, 811, 472, 520, 568, 591, 544...
## $ api99 <int> 785, 718, 632, 740, 711, 779, 432, 494, 589, 585, 554...
## $ target <int> 1, 4, 8, 3, 4, 1, 18, 15, 11, 11, 12, 11, NA, NA, NA,...
## $ growth <int> 36, 55, -32, 0, 5, 32, 40, 26, -21, 6, -10, 29, 14, 2...
## $ sch.wide <fct> Yes, Yes, No, No, Yes, Yes, Yes, Yes, No, No, No, Yes...
## $ comp.imp <fct> Yes, Yes, No, No, Yes, Yes, Yes, Yes, No, No, No, Yes...
## $ both <fct> Yes, Yes, No, No, Yes, Yes, Yes, Yes, No, No, No, Yes...
## $ awards <fct> Yes, Yes, No, No, Yes, Yes, Yes, Yes, No, No, No, Yes...
## $ meals <int> 27, 43, 33, 11, 5, 25, 78, 76, 68, 42, 63, 54, 0, 4, ...
## $ ell <int> 0, 0, 5, 4, 2, 5, 38, 34, 34, 23, 42, 24, 3, 6, 2, 1,...
## $ yr.rnd <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, N...
## $ mobility <int> 14, 12, 9, 8, 6, 19, 13, 13, 15, 4, 15, 15, 24, 19, 1...
## $ acs.k3 <int> 17, 18, 19, NA, 18, 20, 19, 25, NA, NA, 20, NA, 19, 1...
## $ acs.46 <int> 20, 34, 29, 30, 28, 22, NA, 23, 24, NA, NA, 27, 27, 2...
## $ acs.core <int> NA, NA, NA, 24, NA, 31, NA, NA, 25, 21, NA, 18, NA, N...
## $ pct.resp <int> 89, 98, 79, 96, 98, 93, 100, 46, 91, 94, 93, 88, 90, ...
## $ not.hsg <int> 4, 8, 8, 5, 3, 5, 48, 30, 63, 20, 29, 27, 0, 1, 0, 1,...
## $ hsg <int> 16, 33, 28, 27, 14, 9, 32, 27, 16, 18, 32, 25, 0, 7, ...
## $ some.col <int> 53, 37, 30, 35, 22, 30, 15, 21, 13, 27, 26, 24, 4, 8,...
## $ col.grad <int> 21, 15, 32, 27, 58, 37, 4, 13, 6, 28, 7, 18, 51, 42, ...
## $ grad.sch <int> 6, 7, 1, 6, 3, 19, 1, 9, 2, 7, 6, 7, 44, 41, 100, 45,...
## $ avg.ed <dbl> 3.07, 2.79, 2.90, 3.03, 3.44, 3.56, 1.77, 2.42, 1.68,...
## $ full <int> 100, 100, 100, 82, 100, 94, 96, 86, 75, 100, 100, 97,...
## $ emer <int> 0, 0, 0, 18, 8, 6, 8, 24, 21, 4, 4, 3, 0, 4, 0, 4, 28...
## $ enroll <int> 152, 312, 173, 201, 147, 234, 184, 512, 543, 332, 217...
## $ api.stu <int> 120, 270, 151, 179, 136, 189, 158, 419, 423, 303, 182...
## $ pw <dbl> 18.925, 18.925, 18.925, 18.925, 18.925, 18.925, 18.92...
## $ fpc1 <dbl> 757, 757, 757, 757, 757, 757, 757, 757, 757, 757, 757...
## $ fpc2 <int> 1, 1, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 1, 4,...
# Specify the design
# Two-stage cluster design: id = ~dnum + snum gives the stage-1 (district)
# and stage-2 (school) sampling units; fpc1/fpc2 are the matching
# population sizes at each stage
apiclus_design <- svydesign(id = ~dnum + snum, data = apiclus2, weights = ~pw, fpc = ~fpc1 + fpc2)
# Look at the summary information stored for both design stages
summary(apiclus_design)
## 2 - level Cluster Sampling design
## With (40, 126) clusters.
## svydesign(id = ~dnum + snum, data = apiclus2, weights = ~pw,
## fpc = ~fpc1 + fpc2)
## Probabilities:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.003669 0.037743 0.052840 0.042390 0.052840 0.052840
## Population size (PSUs): 757
## Data variables:
## [1] "cds" "stype" "name" "sname" "snum" "dname"
## [7] "dnum" "cname" "cnum" "flag" "pcttest" "api00"
## [13] "api99" "target" "growth" "sch.wide" "comp.imp" "both"
## [19] "awards" "meals" "ell" "yr.rnd" "mobility" "acs.k3"
## [25] "acs.46" "acs.core" "pct.resp" "not.hsg" "hsg" "some.col"
## [31] "col.grad" "grad.sch" "avg.ed" "full" "emer" "enroll"
## [37] "api.stu" "pw" "fpc1" "fpc2"
# Construct histogram of pw
# Under SRS every school has the same weight, so all the mass falls at a
# single value (hence the binwidth warning in the output below)
ggplot(apisrs, aes(pw)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Computation failed in `stat_bin()`:
## `binwidth` must be positive
# Construct histogram of pw
# Stratified design: one weight per stratum, so only a few distinct values
ggplot(apistrat, aes(pw)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Construct histogram of pw
# Weights from the two-stage cluster sample
ggplot(apiclus2, aes(pw)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Load the raw NHANES extract (CSV content despite the .txt extension)
NHANESraw <- read.csv("./RInputFiles/NHANESraw.txt")
# WTMEC2YR is the 2-year-cycle MEC exam weight; halving it yields a weight
# appropriate for the combined 4-year (two-cycle) sample
NHANESraw <- NHANESraw %>%
mutate(WTMEC4YR=WTMEC2YR / 2)
# NOTE(review): positional rename of the first column to "SurveyYr" —
# presumably repairing a mangled header (e.g. BOM artifact); confirm
# against the raw file
names(NHANESraw)[1] <- "SurveyYr"
glimpse(NHANESraw)
## Observations: 20,293
## Variables: 78
## $ SurveyYr <fct> 2009_10, 2009_10, 2009_10, 2009_10, 2009_1...
## $ ID <int> 51624, 51625, 51626, 51627, 51628, 51629, ...
## $ Gender <fct> male, male, male, male, female, male, fema...
## $ Age <int> 34, 4, 16, 10, 60, 26, 49, 1, 10, 80, 10, ...
## $ AgeMonths <int> 409, 49, 202, 131, 722, 313, 596, 12, 124,...
## $ Race1 <fct> White, Other, Black, Black, Black, Mexican...
## $ Race3 <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Education <fct> c_HighSchool, NA, NA, NA, c_HighSchool, b_...
## $ MaritalStatus <fct> Married, NA, NA, NA, Widowed, Married, Liv...
## $ HHIncome <fct> 25000-34999, 20000-24999, 45000-54999, 200...
## $ HHIncomeMid <int> 30000, 22500, 50000, 22500, 12500, 30000, ...
## $ Poverty <dbl> 1.36, 1.07, 2.27, 0.81, 0.69, 1.01, 1.91, ...
## $ HomeRooms <int> 6, 9, 5, 6, 6, 4, 5, 5, 7, 4, 5, 5, 7, NA,...
## $ HomeOwn <fct> Own, Own, Own, Rent, Rent, Rent, Rent, Ren...
## $ Work <fct> NotWorking, NA, NotWorking, NA, NotWorking...
## $ Weight <dbl> 87.4, 17.0, 72.3, 39.8, 116.8, 97.6, 86.7,...
## $ Length <dbl> NA, NA, NA, NA, NA, NA, NA, 75.7, NA, NA, ...
## $ HeadCirc <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Height <dbl> 164.7, 105.4, 181.3, 147.8, 166.0, 173.0, ...
## $ BMI <dbl> 32.22, 15.30, 22.00, 18.22, 42.39, 32.61, ...
## $ BMICatUnder20yrs <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ BMI_WHO <fct> 30.0_plus, 12.0_18.5, 18.5_to_24.9, 12.0_1...
## $ Pulse <int> 70, NA, 68, 68, 72, 72, 86, NA, 70, 88, 84...
## $ BPSysAve <int> 113, NA, 109, 93, 150, 104, 112, NA, 108, ...
## $ BPDiaAve <int> 85, NA, 59, 41, 68, 49, 75, NA, 53, 43, 45...
## $ BPSys1 <int> 114, NA, 112, 92, 154, 102, 118, NA, 106, ...
## $ BPDia1 <int> 88, NA, 62, 36, 70, 50, 82, NA, 60, 62, 38...
## $ BPSys2 <int> 114, NA, 114, 94, 150, 104, 108, NA, 106, ...
## $ BPDia2 <int> 88, NA, 60, 44, 68, 48, 74, NA, 50, 46, 40...
## $ BPSys3 <int> 112, NA, 104, 92, 150, 104, 116, NA, 110, ...
## $ BPDia3 <int> 82, NA, 58, 38, 68, 50, 76, NA, 56, 40, 50...
## $ Testosterone <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ DirectChol <dbl> 1.29, NA, 1.55, 1.89, 1.16, 1.16, 1.16, NA...
## $ TotChol <dbl> 3.49, NA, 4.97, 4.16, 5.22, 4.14, 6.70, NA...
## $ UrineVol1 <int> 352, NA, 281, 139, 30, 202, 77, NA, 39, 12...
## $ UrineFlow1 <dbl> NA, NA, 0.415, 1.078, 0.476, 0.563, 0.094,...
## $ UrineVol2 <int> NA, NA, NA, NA, 246, NA, NA, NA, NA, NA, N...
## $ UrineFlow2 <dbl> NA, NA, NA, NA, 2.51, NA, NA, NA, NA, NA, ...
## $ Diabetes <fct> No, No, No, No, Yes, No, No, No, No, No, N...
## $ DiabetesAge <int> NA, NA, NA, NA, 56, NA, NA, NA, NA, NA, NA...
## $ HealthGen <fct> Good, NA, Vgood, NA, Fair, Good, Good, NA,...
## $ DaysPhysHlthBad <int> 0, NA, 2, NA, 20, 2, 0, NA, NA, 0, NA, 0, ...
## $ DaysMentHlthBad <int> 15, NA, 0, NA, 25, 14, 10, NA, NA, 0, NA, ...
## $ LittleInterest <fct> Most, NA, NA, NA, Most, None, Several, NA,...
## $ Depressed <fct> Several, NA, NA, NA, Most, Most, Several, ...
## $ nPregnancies <int> NA, NA, NA, NA, 1, NA, 2, NA, NA, NA, NA, ...
## $ nBabies <int> NA, NA, NA, NA, 1, NA, 2, NA, NA, NA, NA, ...
## $ Age1stBaby <int> NA, NA, NA, NA, NA, NA, 27, NA, NA, NA, NA...
## $ SleepHrsNight <int> 4, NA, 8, NA, 4, 4, 8, NA, NA, 6, NA, 9, N...
## $ SleepTrouble <fct> Yes, NA, No, NA, No, No, Yes, NA, NA, No, ...
## $ PhysActive <fct> No, NA, Yes, NA, No, Yes, No, NA, NA, Yes,...
## $ PhyActiveDays <int> NA, NA, 5, NA, NA, 2, NA, NA, NA, 4, NA, N...
## $ TVHrsDay <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ TVHrsDay.1 <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ TVHrsDayChild <int> NA, 4, NA, 1, NA, NA, NA, NA, 1, NA, 3, NA...
## $ ComputerHrsDayChild <int> NA, 1, NA, 1, NA, NA, NA, NA, 0, NA, 0, NA...
## $ Alcohol12PlusYr <fct> Yes, NA, NA, NA, No, Yes, Yes, NA, NA, Yes...
## $ AlcoholDay <int> NA, NA, NA, NA, NA, 19, 2, NA, NA, 1, NA, ...
## $ AlcoholYear <int> 0, NA, NA, NA, 0, 48, 20, NA, NA, 52, NA, ...
## $ SmokeNow <fct> No, NA, NA, NA, Yes, No, Yes, NA, NA, No, ...
## $ Smoke100 <fct> Yes, NA, NA, NA, Yes, Yes, Yes, NA, NA, Ye...
## $ SmokeAge <int> 18, NA, NA, NA, 16, 15, 38, NA, NA, 16, NA...
## $ Marijuana <fct> Yes, NA, NA, NA, NA, Yes, Yes, NA, NA, NA,...
## $ AgeFirstMarij <int> 17, NA, NA, NA, NA, 10, 18, NA, NA, NA, NA...
## $ RegularMarij <fct> No, NA, NA, NA, NA, Yes, No, NA, NA, NA, N...
## $ AgeRegMarij <int> NA, NA, NA, NA, NA, 12, NA, NA, NA, NA, NA...
## $ HardDrugs <fct> Yes, NA, NA, NA, No, Yes, Yes, NA, NA, NA,...
## $ SexEver <fct> Yes, NA, NA, NA, Yes, Yes, Yes, NA, NA, NA...
## $ SexAge <int> 16, NA, NA, NA, 15, 9, 12, NA, NA, NA, NA,...
## $ SexNumPartnLife <int> 8, NA, NA, NA, 4, 10, 10, NA, NA, NA, NA, ...
## $ SexNumPartYear <int> 1, NA, NA, NA, NA, 1, 1, NA, NA, NA, NA, N...
## $ SameSex <fct> No, NA, NA, NA, No, No, Yes, NA, NA, NA, N...
## $ SexOrientation <fct> Heterosexual, NA, NA, NA, NA, Heterosexual...
## $ WTINT2YR <dbl> 80100.544, 53901.104, 13953.078, 11664.899...
## $ WTMEC2YR <dbl> 81528.772, 56995.035, 14509.279, 12041.635...
## $ SDMVPSU <int> 1, 2, 1, 2, 2, 1, 2, 2, 2, 1, 1, 1, 2, 2, ...
## $ SDMVSTRA <int> 83, 79, 84, 86, 75, 88, 85, 86, 88, 77, 86...
## $ WTMEC4YR <dbl> 40764.386, 28497.518, 7254.639, 6020.818, ...
# Create table of average survey weights by race
# Groups under-sampled relative to their population share carry larger
# average weights (summarize drops the single grouping level, so the
# trailing ungroup is a no-op kept for explicitness)
tab_weights <- NHANESraw %>%
  group_by(Race1) %>%
  summarize(avg_wt = mean(WTMEC4YR)) %>%
  ungroup()
# Print the table
tab_weights
## # A tibble: 5 x 2
## Race1 avg_wt
## <fct> <dbl>
## 1 Black 8026.
## 2 Hispanic 8579.
## 3 Mexican 8216.
## 4 Other 10116.
## 5 White 26236.
# The two important design variables in NHANESraw are SDMVSTRA, which contains the strata assignment for each unit, and SDMVPSU, which contains the cluster id within a given stratum
# Specify the NHANES design: stratified cluster sampling with per-person
# weights. nest = TRUE because the cluster ids (SDMVPSU) restart within each
# stratum (values 1, 2, 3 repeat across strata, as the summary below shows).
NHANES_design <- svydesign(data = NHANESraw, strata = ~SDMVSTRA, id = ~SDMVPSU,
nest = TRUE, weights = ~WTMEC4YR
)
# Print summary of design
summary(NHANES_design)
## Stratified 1 - level Cluster Sampling design (with replacement)
## With (62) clusters.
## svydesign(data = NHANESraw, strata = ~SDMVSTRA, id = ~SDMVPSU,
## nest = TRUE, weights = ~WTMEC4YR)
## Probabilities:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.986e-06 5.664e-05 1.054e-04 Inf 1.721e-04 Inf
## Stratum Sizes:
## 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## obs 803 785 823 829 696 751 696 724 713 683 592 946 598 647 251 862
## design.PSU 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 3
## actual.PSU 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 3
## 91 92 93 94 95 96 97 98 99 100 101 102 103
## obs 998 875 602 688 722 676 608 708 682 700 715 624 296
## design.PSU 3 3 2 2 2 2 2 2 2 2 2 2 2
## actual.PSU 3 3 2 2 2 2 2 2 2 2 2 2 2
## Data variables:
## [1] "SurveyYr" "ID" "Gender"
## [4] "Age" "AgeMonths" "Race1"
## [7] "Race3" "Education" "MaritalStatus"
## [10] "HHIncome" "HHIncomeMid" "Poverty"
## [13] "HomeRooms" "HomeOwn" "Work"
## [16] "Weight" "Length" "HeadCirc"
## [19] "Height" "BMI" "BMICatUnder20yrs"
## [22] "BMI_WHO" "Pulse" "BPSysAve"
## [25] "BPDiaAve" "BPSys1" "BPDia1"
## [28] "BPSys2" "BPDia2" "BPSys3"
## [31] "BPDia3" "Testosterone" "DirectChol"
## [34] "TotChol" "UrineVol1" "UrineFlow1"
## [37] "UrineVol2" "UrineFlow2" "Diabetes"
## [40] "DiabetesAge" "HealthGen" "DaysPhysHlthBad"
## [43] "DaysMentHlthBad" "LittleInterest" "Depressed"
## [46] "nPregnancies" "nBabies" "Age1stBaby"
## [49] "SleepHrsNight" "SleepTrouble" "PhysActive"
## [52] "PhyActiveDays" "TVHrsDay" "TVHrsDay.1"
## [55] "TVHrsDayChild" "ComputerHrsDayChild" "Alcohol12PlusYr"
## [58] "AlcoholDay" "AlcoholYear" "SmokeNow"
## [61] "Smoke100" "SmokeAge" "Marijuana"
## [64] "AgeFirstMarij" "RegularMarij" "AgeRegMarij"
## [67] "HardDrugs" "SexEver" "SexAge"
## [70] "SexNumPartnLife" "SexNumPartYear" "SameSex"
## [73] "SexOrientation" "WTINT2YR" "WTMEC2YR"
## [76] "SDMVPSU" "SDMVSTRA" "WTMEC4YR"
# Number of clusters: count distinct (stratum, cluster-id) pairs, since
# SDMVPSU values repeat across strata.
NHANESraw %>%
summarize(n_clusters = n_distinct(SDMVSTRA, SDMVPSU))
## n_clusters
## 1 62
# Sample sizes in clusters (one row per stratum/cluster combination)
NHANESraw %>%
count(SDMVSTRA, SDMVPSU)
## # A tibble: 62 x 3
## SDMVSTRA SDMVPSU n
## <int> <int> <int>
## 1 75 1 379
## 2 75 2 424
## 3 76 1 419
## 4 76 2 366
## 5 77 1 441
## 6 77 2 382
## 7 78 1 378
## 8 78 2 451
## 9 79 1 349
## 10 79 2 347
## # ... with 52 more rows
Chapter 2 - Exploring categorical data
Visualizing categorical variables:
Exploring two categorical variables:
Inference for categorical variables:
Example code includes:
# Specify the survey design. The weight is halved here — presumably because
# two 2-year survey cycles are pooled into a 4-year weight (TODO confirm
# against NHANES documentation).
NHANESraw <- mutate(NHANESraw, WTMEC4YR = .5 * WTMEC2YR)
NHANES_design <- svydesign(data = NHANESraw, strata = ~SDMVSTRA, id = ~SDMVPSU, nest = TRUE, weights = ~WTMEC4YR)
# Determine the levels of Depressed
levels(NHANESraw$Depressed)
## [1] "Most" "None" "Several"
# Construct a survey-weighted frequency table of Depressed
tab_w <- svytable(~Depressed, design = NHANES_design)
# Determine class of tab_w — an svytable is also an xtabs/table
class(tab_w)
## [1] "svytable" "xtabs" "table"
# Display tab_w: entries are estimated population counts, not sample counts
tab_w
## Depressed
## Most None Several
## 12704441 158758609 32732508
# Add proportions to table (note: tab_w is converted to a data frame here,
# so it is no longer an svytable after this step)
tab_w <- tab_w %>%
as.data.frame() %>%
mutate(Prop = Freq/sum(Freq))
# Create a barplot of the weighted proportions
ggplot(data = tab_w, mapping = aes(x = Depressed, y = Prop)) +
geom_col()
# Construct and print a frequency table
tab_D <- svytable(~Depressed, design = NHANES_design)
tab_D
## Depressed
## Most None Several
## 12704441 158758609 32732508
# Construct and print a weighted frequency table of HealthGen
tab_H <- svytable(~HealthGen, design = NHANES_design)
tab_H
## HealthGen
## Excellent Fair Good Poor Vgood
## 27659954 31544030 87497585 5668484 77482169
# Construct and print a two-way weighted table of Depressed by HealthGen
tab_DH <- svytable(~Depressed + HealthGen, design = NHANES_design)
tab_DH
## HealthGen
## Depressed Excellent Fair Good Poor Vgood
## Most 563613.3 3935505.6 4698948.1 1650509.5 1855864.8
## None 21327181.6 17690782.8 59920031.9 2324945.0 57487318.5
## Several 1870620.9 7355104.8 13950468.6 1253819.6 8302494.5
# Add conditional proportions to tab_DH: within each HealthGen level,
# the share at each Depressed level (Prop_Depressed sums to 1 per HealthGen)
tab_DH_cond <- tab_DH %>%
as.data.frame() %>%
group_by(HealthGen) %>%
mutate(n_HealthGen = sum(Freq), Prop_Depressed = Freq/n_HealthGen) %>%
ungroup()
# Print tab_DH_cond
tab_DH_cond
## # A tibble: 15 x 5
## Depressed HealthGen Freq n_HealthGen Prop_Depressed
## <fct> <fct> <dbl> <dbl> <dbl>
## 1 Most Excellent 563613. 23761416. 0.0237
## 2 None Excellent 21327182. 23761416. 0.898
## 3 Several Excellent 1870621. 23761416. 0.0787
## 4 Most Fair 3935506. 28981393. 0.136
## 5 None Fair 17690783. 28981393. 0.610
## 6 Several Fair 7355105. 28981393. 0.254
## 7 Most Good 4698948. 78569449. 0.0598
## 8 None Good 59920032. 78569449. 0.763
## 9 Several Good 13950469. 78569449. 0.178
## 10 Most Poor 1650510. 5229274. 0.316
## 11 None Poor 2324945. 5229274. 0.445
## 12 Several Poor 1253820. 5229274. 0.240
## 13 Most Vgood 1855865. 67645678. 0.0274
## 14 None Vgood 57487318. 67645678. 0.850
## 15 Several Vgood 8302494. 67645678. 0.123
# Create a segmented bar graph of the conditional proportions in tab_DH_cond
ggplot(data = tab_DH_cond, mapping = aes(x = HealthGen, y = Prop_Depressed, fill = Depressed)) +
geom_col() +
coord_flip()
# We can also estimate counts with svytotal(). The syntax is given by:
# svytotal(x = ~interaction(Var1, Var2), design = design, na.rm = TRUE)
# For each combination of the two variables, we get an estimate of the total and the standard error
# Estimate the population totals for combos of Depressed and HealthGen
tab_totals <- svytotal(x = ~interaction(Depressed, HealthGen), design = NHANES_design, na.rm = TRUE)
# Print table of totals
tab_totals
## total SE
## interaction(Depressed, HealthGen)Most.Excellent 563613 139689
## interaction(Depressed, HealthGen)None.Excellent 21327182 1556268
## interaction(Depressed, HealthGen)Several.Excellent 1870621 277198
## interaction(Depressed, HealthGen)Most.Fair 3935506 370256
## interaction(Depressed, HealthGen)None.Fair 17690783 1206307
## interaction(Depressed, HealthGen)Several.Fair 7355105 455364
## interaction(Depressed, HealthGen)Most.Good 4698948 501105
## interaction(Depressed, HealthGen)None.Good 59920032 3375068
## interaction(Depressed, HealthGen)Several.Good 13950469 931077
## interaction(Depressed, HealthGen)Most.Poor 1650510 195136
## interaction(Depressed, HealthGen)None.Poor 2324945 251934
## interaction(Depressed, HealthGen)Several.Poor 1253820 168440
## interaction(Depressed, HealthGen)Most.Vgood 1855865 269970
## interaction(Depressed, HealthGen)None.Vgood 57487319 2975806
## interaction(Depressed, HealthGen)Several.Vgood 8302495 687020
# Estimate the means for combos of Depressed and HealthGen.
# For a 0/1 indicator like interaction(), svymean gives the estimated
# proportion of the population falling in each cell.
tab_means <- svymean(x = ~interaction(Depressed, HealthGen), design = NHANES_design, na.rm = TRUE)
# Print table of means
tab_means
## mean SE
## interaction(Depressed, HealthGen)Most.Excellent 0.0027603 0.0007
## interaction(Depressed, HealthGen)None.Excellent 0.1044492 0.0053
## interaction(Depressed, HealthGen)Several.Excellent 0.0091613 0.0014
## interaction(Depressed, HealthGen)Most.Fair 0.0192740 0.0019
## interaction(Depressed, HealthGen)None.Fair 0.0866400 0.0047
## interaction(Depressed, HealthGen)Several.Fair 0.0360214 0.0026
## interaction(Depressed, HealthGen)Most.Good 0.0230129 0.0023
## interaction(Depressed, HealthGen)None.Good 0.2934563 0.0092
## interaction(Depressed, HealthGen)Several.Good 0.0683220 0.0033
## interaction(Depressed, HealthGen)Most.Poor 0.0080833 0.0010
## interaction(Depressed, HealthGen)None.Poor 0.0113863 0.0013
## interaction(Depressed, HealthGen)Several.Poor 0.0061405 0.0009
## interaction(Depressed, HealthGen)Most.Vgood 0.0090890 0.0013
## interaction(Depressed, HealthGen)None.Vgood 0.2815422 0.0078
## interaction(Depressed, HealthGen)Several.Vgood 0.0406612 0.0028
# Run a chi square test of independence between Depressed and HealthGen.
# svychisq applies a Rao & Scott adjustment to account for the survey design
# (see the printed test header in the output below).
svychisq(~Depressed + HealthGen, design = NHANES_design, statistic = "Chisq")
##
## Pearson's X^2: Rao & Scott adjustment
##
## data: svychisq(~Depressed + HealthGen, design = NHANES_design, statistic = "Chisq")
## X-squared = 1592.7, df = 8, p-value < 2.2e-16
# Construct a survey-weighted contingency table of Education by HomeOwn
tab <- svytable(~Education + HomeOwn, design=NHANES_design)
# Add conditional proportion of levels of HomeOwn for each educational level
# (Prop_HomeOwn sums to 1 within each Education level)
tab_df <- as.data.frame(tab) %>%
group_by(Education) %>%
mutate(n_Education = sum(Freq), Prop_HomeOwn = Freq/n_Education) %>%
ungroup()
# Create a segmented bar graph of the conditional proportions
ggplot(data = tab_df, mapping = aes(x=Education, y=Prop_HomeOwn, fill=HomeOwn)) +
geom_col() +
coord_flip()
# Run a design-adjusted chi square test of independence
svychisq(~Education + HomeOwn,
design = NHANES_design,
statistic = "Chisq")
##
## Pearson's X^2: Rao & Scott adjustment
##
## data: svychisq(~Education + HomeOwn, design = NHANES_design, statistic = "Chisq")
## X-squared = 531.78, df = 8, p-value = 2.669e-16
Chapter 3 - Exploring quantitative data
Summarizing quantitative data:
Visualizing quantitative data:
Inference for quantitative data:
Example code includes:
# Compute the survey-weighted mean of nightly sleep hours
svymean(x = ~SleepHrsNight, design = NHANES_design, na.rm = TRUE)
## mean SE
## SleepHrsNight 6.9292 0.0166
# Compute the survey-weighted mean by Gender (svyby applies FUN per group)
svyby(formula = ~SleepHrsNight, by = ~Gender, design = NHANES_design,
FUN = svymean, na.rm = TRUE, keep.names = FALSE
)
## Gender SleepHrsNight se
## 1 female 6.976103 0.02374684
## 2 male 6.879050 0.01953263
# Compute the survey-weighted quantiles
svyquantile(x = ~SleepHrsNight, design = NHANES_design, na.rm = TRUE,
quantiles = c(0.01, 0.25, 0.5, 0.75, .99)
)
## 0.01 0.25 0.5 0.75 0.99
## SleepHrsNight 4 6 7 8 10
# Compute the survey-weighted median (0.5 quantile) by Gender
svyby(formula = ~SleepHrsNight, by = ~Gender, design = NHANES_design, FUN = svyquantile,
na.rm = TRUE, quantiles = c(0.5), keep.rows = FALSE, keep.var = FALSE
)
## Gender statistic
## female female 7
## male male 7
# Compute the survey-weighted mean by Gender
out <- svyby(formula = ~SleepHrsNight, by = ~Gender, design = NHANES_design,
FUN = svymean, na.rm = TRUE, keep.names = FALSE
)
# Construct a bar plot of average sleep by gender
ggplot(data = out, mapping = aes(x=as.factor(Gender), y=SleepHrsNight)) +
geom_col() +
labs(y="Average Nightly Sleep")
# Add lower and upper columns to out: mean +/- 2*SE, an approximate 95% CI
out_col <- mutate(out, lower = SleepHrsNight - 2*se, upper = SleepHrsNight + 2*se)
# Construct a bar plot of average sleep by gender with error bars
ggplot(data = out_col, mapping = aes(x = Gender, y = SleepHrsNight, ymin = lower, ymax = upper)) +
geom_col(fill = "gold") +
labs(y = "Average Nightly Sleep") +
geom_errorbar(width = 0.7)
# Create a survey-weighted histogram with a set binwidth (weight aesthetic)
ggplot(data = NHANESraw, mapping = aes(x=SleepHrsNight, weight=WTMEC4YR)) +
geom_histogram(binwidth = 1, color = "white") +
labs(x = "Hours of Sleep")
## Warning: Removed 7261 rows containing non-finite values (stat_bin).
# Same histogram with a narrower binwidth (0.5 hours)
ggplot(data = NHANESraw, mapping = aes(x=SleepHrsNight, weight=WTMEC4YR)) +
geom_histogram(binwidth = 0.5, color = "white") +
labs(x = "Hours of Sleep")
## Warning: Removed 7261 rows containing non-finite values (stat_bin).
# Same histogram with a wider binwidth (2 hours)
ggplot(data = NHANESraw, mapping = aes(x=SleepHrsNight, weight=WTMEC4YR)) +
geom_histogram(binwidth = 2, color = "white") +
labs(x = "Hours of Sleep")
## Warning: Removed 7261 rows containing non-finite values (stat_bin).
# Density plot of sleep faceted by gender.
# Weights are rescaled to sum to 1 within each gender so each facet's
# density integrates to 1.
NHANESraw %>%
filter(!is.na(SleepHrsNight), !is.na(Gender)) %>%
group_by(Gender) %>%
mutate(WTMEC4YR_std = WTMEC4YR/sum(WTMEC4YR)) %>%
ggplot(mapping = aes(x = SleepHrsNight, weight = WTMEC4YR_std)) +
geom_density(bw = 0.6, fill = "gold") +
labs(x = "Hours of Sleep") +
facet_wrap(~Gender, labeller = "label_both")
# Run a survey-weighted t-test for a gender difference in mean sleep
svyttest(formula = SleepHrsNight ~ Gender, design = NHANES_design)
## Warning in summary.glm(g): observations with zero weight not used for
## calculating dispersion
## Warning in summary.glm(glm.object): observations with zero weight not used
## for calculating dispersion
##
## Design-based t-test
##
## data: SleepHrsNight ~ Gender
## t = -3.4077, df = 32, p-value = 0.001785
## alternative hypothesis: true difference in mean is not equal to 0
## 95 percent confidence interval:
## -0.15287218 -0.04123256
## sample estimates:
## difference in mean
## -0.09705237
# Find survey-weighted means of total cholesterol by whether or not active
out <- svyby(formula = ~TotChol, by = ~PhysActive, design = NHANES_design,
FUN = svymean, na.rm = TRUE, keep.names = FALSE
)
# Construct a bar plot of means of total cholesterol by whether or not active
ggplot(data = out, mapping = aes(x=PhysActive, y=TotChol)) +
geom_col()
# Run a design-based t test for the difference in mean total cholesterol
svyttest(formula = TotChol ~ PhysActive, design = NHANES_design)
##
## Design-based t-test
##
## data: TotChol ~ PhysActive
## t = -3.7936, df = 32, p-value = 0.0006232
## alternative hypothesis: true difference in mean is not equal to 0
## 95 percent confidence interval:
## -0.20053677 -0.06390939
## sample estimates:
## difference in mean
## -0.1322231
Chapter 4 - Modeling quantitative data
Visualization with scatter plots:
Visualizing trends:
Modeling survey data:
More complex modeling:
Wrap up:
Example code includes:
# Create dataset with only 20 year olds
NHANES20 <- filter(NHANESraw, Age == 20)
# Construct a plain (unweighted) scatter plot of Weight vs Height
ggplot(data = NHANES20, mapping = aes(x=Height, y=Weight)) +
geom_point(alpha = 0.3) +
guides(size = FALSE)
## Warning: Removed 4 rows containing missing values (geom_point).
# Construct bubble plot: point size shows the survey weight
ggplot(data = NHANES20, mapping = aes(x=Height, y=Weight, size=WTMEC4YR)) +
geom_point(alpha = 0.3) +
guides(size = FALSE)
## Warning: Removed 4 rows containing missing values (geom_point).
# Construct a scatter plot: point color shows the survey weight
ggplot(data = NHANES20, mapping = aes(x=Height, y=Weight, color=WTMEC4YR)) +
geom_point() +
guides(color = FALSE)
## Warning: Removed 4 rows containing missing values (geom_point).
# Construct a scatter plot: point transparency shows the survey weight
ggplot(data = NHANES20, mapping = aes(x=Height, y=Weight, alpha=WTMEC4YR)) +
geom_point() +
guides(alpha = FALSE)
## Warning: Removed 4 rows containing missing values (geom_point).
# Add gender to the size-based plot via color
ggplot(data = NHANES20, mapping = aes(x=Height, y=Weight, size=WTMEC4YR, color=Gender)) +
geom_point(alpha=0.3) +
guides(size = FALSE)
## Warning: Removed 4 rows containing missing values (geom_point).
# Add gender to the alpha-based plot via color
ggplot(data = NHANES20, mapping = aes(x=Height, y=Weight, alpha=WTMEC4YR, color=Gender)) +
geom_point() +
guides(alpha = FALSE)
## Warning: Removed 4 rows containing missing values (geom_point).
# Bubble plot with a linear line of best fit.
# Note the aes(weight = ...) inside geom_smooth: the lm fit itself is
# survey-weighted, not just the point sizes.
ggplot(data = NHANESraw, mapping = aes(x = Height, y = Weight, size=WTMEC4YR)) +
geom_point(alpha = 0.1) +
guides(size = FALSE) +
geom_smooth(method = "lm", se = FALSE, mapping = aes(weight=WTMEC4YR))
## Warning: Removed 2279 rows containing non-finite values (stat_smooth).
## Warning: Removed 2279 rows containing missing values (geom_point).
# Add quadratic curve and cubic curve (all fits survey-weighted via aes(weight))
ggplot(data = NHANESraw, mapping = aes(x = Height, y = Weight, size = WTMEC4YR)) +
geom_point(alpha = 0.1) +
guides(size = FALSE) +
geom_smooth(method = "lm", se = FALSE, mapping = aes(weight = WTMEC4YR)) +
geom_smooth(method = "lm", se = FALSE, mapping = aes(weight = WTMEC4YR), formula = y ~ poly(x, 2), color = "orange") +
geom_smooth(method = "lm", se = FALSE, mapping = aes(weight = WTMEC4YR), formula = y ~ poly(x, 3), color = "red")
## Warning: Removed 2279 rows containing non-finite values (stat_smooth).
## Warning: Removed 2279 rows containing non-finite values (stat_smooth).
## Warning: Removed 2279 rows containing non-finite values (stat_smooth).
## Warning: Removed 2279 rows containing missing values (geom_point).
# NOTE(review): the next two comment labels look swapped — this dashed
# geom_smooth has NO weight aesthetic, so it is the unweighted trend line;
# the weighted line is the one added below with aes(weight = WTMEC4YR).
# Add survey-weighted trend lines to bubble plot
ggplot(data = NHANES20, mapping = aes(x = Height, y = Weight, size = WTMEC4YR, color = Gender)) +
geom_point(alpha = 0.1) +
guides(size = FALSE) +
geom_smooth(method = "lm", se = FALSE, linetype = 2)
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## Warning: Removed 4 rows containing missing values (geom_point).
# Add non-survey-weighted trend lines
ggplot(data = NHANES20, mapping = aes(x = Height, y = Weight, size = WTMEC4YR, color = Gender)) +
geom_point(alpha = 0.1) +
guides(size = FALSE) +
geom_smooth(method = "lm", se = FALSE, linetype = 2) +
geom_smooth(method = "lm", se = FALSE, mapping = aes(weight=WTMEC4YR))
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## Warning: Removed 4 rows containing missing values (geom_point).
# Subset survey design object to only include 20 year olds
# (subsetting the design, not the raw data, keeps variance estimation valid)
NHANES20_design <- subset(NHANES_design, Age == 20)
# Build a design-based linear regression model
mod <- svyglm(Weight ~ Height, design = NHANES20_design)
# Print summary of the model
summary(mod)
##
## Call:
## svyglm(formula = Weight ~ Height, design = NHANES20_design)
##
## Survey design:
## subset(NHANES_design, Age == 20)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -67.2571 22.9836 -2.926 0.00674 **
## Height 0.8305 0.1368 6.072 1.51e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 326.6108)
##
## Number of Fisher Scoring iterations: 2
# Build a parallel-slopes model: the additive Gender term shifts the
# intercept only, so both genders share one Height slope
mod1 <- svyglm(Weight ~ Height + Gender, design = NHANES20_design)
# Print summary of the same slope model
summary(mod1)
##
## Call:
## svyglm(formula = Weight ~ Height + Gender, design = NHANES20_design)
##
## Survey design:
## subset(NHANES_design, Age == 20)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -53.8665 22.7622 -2.366 0.0254 *
## Height 0.7434 0.1391 5.346 1.2e-05 ***
## Gendermale 2.7207 3.2471 0.838 0.4095
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 325.3881)
##
## Number of Fisher Scoring iterations: 2
# Build an interaction model: Height*Gender allows a different Height slope
# for each gender (the Height:Gendermale term is the slope difference)
mod2 <- svyglm(Weight ~ Height*Gender, design = NHANES20_design)
# Print summary of the different slopes model
summary(mod2)
##
## Call:
## svyglm(formula = Weight ~ Height * Gender, design = NHANES20_design)
##
## Survey design:
## subset(NHANES_design, Age == 20)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.5061 21.5357 0.441 0.66257
## Height 0.3565 0.1269 2.809 0.00932 **
## Gendermale -131.0884 41.9989 -3.121 0.00438 **
## Height:Gendermale 0.7897 0.2385 3.311 0.00273 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 316.5007)
##
## Number of Fisher Scoring iterations: 2
# Plot BPDiaAve and BPSysAve by Diabetes and include survey-weighted trend lines
drop_na(NHANESraw, Diabetes) %>%
ggplot(mapping = aes(x=BPDiaAve, y=BPSysAve, size=WTMEC4YR, color=Diabetes)) +
geom_point(alpha = 0.2) +
guides(size = FALSE) +
geom_smooth(method="lm", se = FALSE, mapping = aes(weight=WTMEC4YR))
## Warning: Removed 4600 rows containing non-finite values (stat_smooth).
## Warning: Removed 4600 rows containing missing values (geom_point).
# Build simple linear regression model (one common slope)
mod1 <- svyglm(BPSysAve ~ BPDiaAve, design = NHANES_design)
# Build model with different slopes per Diabetes status (interaction)
mod2 <- svyglm(BPSysAve ~ BPDiaAve*Diabetes, design = NHANES_design)
# Summarize models
summary(mod1)
##
## Call:
## svyglm(formula = BPSysAve ~ BPDiaAve, design = NHANES_design)
##
## Survey design:
## svydesign(data = NHANESraw, strata = ~SDMVSTRA, id = ~SDMVPSU,
## nest = TRUE, weights = ~WTMEC4YR)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 85.74311 1.86920 45.87 <2e-16 ***
## BPDiaAve 0.48150 0.02354 20.45 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 290.3472)
##
## Number of Fisher Scoring iterations: 2
summary(mod2)
##
## Call:
## svyglm(formula = BPSysAve ~ BPDiaAve * Diabetes, design = NHANES_design)
##
## Survey design:
## svydesign(data = NHANESraw, strata = ~SDMVSTRA, id = ~SDMVPSU,
## nest = TRUE, weights = ~WTMEC4YR)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.58652 2.05537 40.667 < 2e-16 ***
## BPDiaAve 0.49964 0.02623 19.047 < 2e-16 ***
## DiabetesYes 25.36616 3.56587 7.114 6.53e-08 ***
## BPDiaAve:DiabetesYes -0.22132 0.05120 -4.323 0.000156 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 279.1637)
##
## Number of Fisher Scoring iterations: 2
Chapter 1 - Inference for a Single Parameter
General Social Survey:
CI interpretations:
Approximation shortcut:
Example code includes:
# Load the General Social Survey data (creates the `gss` data frame) and
# inspect its structure
load("./RInputFiles/gss.RData")
glimpse(gss)
## Observations: 50,346
## Variables: 28
## $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16...
## $ year <dbl> 1982, 1982, 1982, 1982, 1982, 1982, 1982, 1982, 1982,...
## $ age <fct> 41, 49, 27, 24, 57, 29, 21, 68, 54, 80, 74, 30, 53, 3...
## $ class <fct> WORKING CLASS, WORKING CLASS, MIDDLE CLASS, MIDDLE CL...
## $ degree <fct> LT HIGH SCHOOL, HIGH SCHOOL, HIGH SCHOOL, HIGH SCHOOL...
## $ sex <fct> MALE, FEMALE, FEMALE, FEMALE, MALE, MALE, FEMALE, MAL...
## $ marital <fct> MARRIED, MARRIED, NEVER MARRIED, NEVER MARRIED, NEVER...
## $ race <fct> WHITE, WHITE, WHITE, WHITE, WHITE, WHITE, WHITE, WHIT...
## $ region <fct> NEW ENGLAND, NEW ENGLAND, NEW ENGLAND, NEW ENGLAND, N...
## $ partyid <fct> STRONG DEMOCRAT, STRONG DEMOCRAT, IND,NEAR DEM, IND,N...
## $ happy <fct> PRETTY HAPPY, NOT TOO HAPPY, VERY HAPPY, PRETTY HAPPY...
## $ grass <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ relig <fct> CATHOLIC, CATHOLIC, CATHOLIC, CATHOLIC, CATHOLIC, CAT...
## $ cappun2 <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ cappun <fct> FAVOR, FAVOR, FAVOR, OPPOSE, OPPOSE, FAVOR, OPPOSE, F...
## $ finalter <fct> STAYED SAME, WORSE, BETTER, BETTER, STAYED SAME, BETT...
## $ protest3 <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ natspac <fct> ABOUT RIGHT, TOO MUCH, TOO LITTLE, TOO LITTLE, ABOUT ...
## $ natarms <fct> TOO LITTLE, TOO LITTLE, ABOUT RIGHT, TOO MUCH, TOO LI...
## $ conclerg <fct> ONLY SOME, ONLY SOME, A GREAT DEAL, ONLY SOME, A GREA...
## $ confed <fct> ONLY SOME, ONLY SOME, ONLY SOME, ONLY SOME, A GREAT D...
## $ conpress <fct> ONLY SOME, ONLY SOME, A GREAT DEAL, ONLY SOME, A GREA...
## $ conjudge <fct> HARDLY ANY, ONLY SOME, A GREAT DEAL, A GREAT DEAL, A ...
## $ consci <fct> ONLY SOME, ONLY SOME, A GREAT DEAL, A GREAT DEAL, A G...
## $ conlegis <fct> ONLY SOME, ONLY SOME, ONLY SOME, ONLY SOME, A GREAT D...
## $ zodiac <fct> TAURUS, CAPRICORN, VIRGO, PISCES, CAPRICORN, LEO, LIB...
## $ oversamp <dbl> 1.235, 1.235, 1.235, 1.235, 1.235, 1.235, 1.235, 1.23...
## $ postlife <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
# Subset data from 2016 and tabulate consci
# (consci — presumably confidence in the scientific community; verify against
# the GSS codebook)
gss2016 <- gss %>%
filter(year == 2016)
gss2016 %>% count(consci)
## # A tibble: 4 x 2
## consci n
## <fct> <int>
## 1 A GREAT DEAL 791
## 2 ONLY SOME 976
## 3 HARDLY ANY 117
## 4 <NA> 983
# Collapse consci into a binary factor: "High" = "A GREAT DEAL",
# "Low" = every other level. The original variable is preserved in old_consci.
gss2016 <- gss2016 %>%
mutate(old_consci=consci,
consci=fct_other(fct_recode(old_consci, "High"="A GREAT DEAL"), keep="High", other_level="Low")
)
gss2016 %>% count(consci)
## # A tibble: 3 x 2
## consci n
## <fct> <int>
## 1 High 791
## 2 Low 1093
## 3 <NA> 983
# Plot distribution of consci
ggplot(gss2016, aes(x = consci)) +
geom_bar()
# Compute the observed sample proportion with high confidence (NAs excluded)
p_hat <- gss2016 %>%
summarize(p = mean(consci == "High", na.rm = TRUE)) %>%
pull()
# Load the infer package (specify/generate/calculate pipeline)
library(infer)
# Create a single bootstrap data set: resample the rows with replacement
b1 <- gss2016 %>%
specify(response = consci, success = "High") %>%
generate(reps = 1, type = "bootstrap")
## Warning: Removed 983 rows containing missing values.
# Plot distribution of consci in the single bootstrap sample
ggplot(b1, aes(x = consci)) +
geom_bar()
# Compute the proportion with high confidence in this bootstrap sample
# (it varies around the observed p_hat)
b1 %>%
summarize(p = mean(consci == "High")) %>%
pull()
## [1] 0.4187898
# Create the bootstrap distribution for the proportion with High confidence
# (500 resamples; each contributes one "prop" statistic)
boot_dist <- gss2016 %>%
specify(response = consci, success = "High") %>%
generate(reps = 500) %>%
calculate(stat = "prop", success = "High", na.rm = TRUE)
## Warning: Removed 983 rows containing missing values.
# Plot the bootstrap distribution
ggplot(boot_dist, aes(x=stat)) +
geom_density()
# Compute estimate of SE: the standard deviation of the bootstrap statistics
SE <- boot_dist %>%
summarize(se = sd(stat)) %>%
pull()
# Create an approximate 95% CI: observed estimate +/- 2 SE
c(p_hat - 2*SE, p_hat + 2*SE)
## [1] 0.3964511 0.4432517
# Two new smaller data sets have been created for you from gss2016: gss2016_small, which contains 50 observations, and gss2016_smaller which contains just 10 observations
# (the specific id vectors reproduce the course's random subsets)
id50 <- c(6, 98, 2673, 1435, 1535, 525, 2784, 1765, 163, 1859, 2497, 1780, 184, 575, 2781, 2310, 1677, 2478, 1226, 2350, 1139, 1635, 1350, 1809, 1842, 1501, 1502, 2610, 2456, 49, 56, 2167, 2401, 2002, 2343, 2012, 860, 2557, 1147, 1119, 2449, 695, 1511, 666, 1595, 1094, 2643, 769, 1263, 2426)
id10 <- c(1609, 1342, 2066, 2710, 1809, 503, 1889, 486, 1469, 6)
gss2016_small <- gss2016 %>%
filter(id %in% id50)
gss2016_smaller <- gss2016 %>%
filter(id %in% id10)
# Create bootstrap distribution for the proportion (n = 50 subset)
boot_dist_small <- gss2016_small %>%
specify(response = consci, success = "High") %>%
generate(reps = 500, type = "bootstrap") %>%
calculate(stat = "prop")
# Compute estimate of SE
SE_small_n <- boot_dist_small %>%
summarize(se = sd(stat)) %>%
pull()
# Create bootstrap distribution for the proportion (n = 10 subset)
boot_dist_smaller <- gss2016_smaller %>%
specify(response = consci, success = "High") %>%
generate(reps = 500, type = "bootstrap") %>%
calculate(stat = "prop")
# Compute estimate of SE
SE_smaller_n <- boot_dist_smaller %>%
summarize(se = sd(stat)) %>%
pull()
# Compare: the smaller sample yields the larger standard error
c(SE_small_n, SE_smaller_n)
## [1] 0.07206823 0.14608464
# Create bootstrap distribution for the proportion in the collapsed "Low"
# category. NOTE(review): the original comment said "hardy [hardly] any",
# but the code uses success = "Low", i.e. everything other than
# "A GREAT DEAL" after the earlier fct_other() recode.
boot_dist <- gss2016 %>%
specify(response=consci, success = "Low") %>%
generate(reps=500, type="bootstrap") %>%
calculate(stat = "prop", na.rm = TRUE)
## Warning: Removed 983 rows containing missing values.
# Compute estimate of SE
SE_low_p <- boot_dist %>%
summarize(se = sd(stat)) %>%
pull()
# Compute p-hat and n for the approximation (normal-theory) shortcut
p_hat <- gss2016_small %>%
summarize(p = mean(consci == "High", na.rm=TRUE)) %>%
pull()
n <- nrow(gss2016_small)
# Check the success-failure conditions (at least 10 expected in each group)
p_hat * n >= 10
## [1] TRUE
(1 - p_hat) * n >= 10
## [1] TRUE
# Calculate SE from the formula sqrt(p(1-p)/n) instead of bootstrapping
SE_approx <- sqrt(p_hat * (1 - p_hat) / n)
# Form 95% CI
c(p_hat - 2 * SE_approx, p_hat + 2 * SE_approx)
## [1] 0.242712 0.517288
Chapter 2 - Proportions (Testing and Power)
Hypothesis test for a proportion:
Intervals for differences:
Statistical errors:
Example code includes:
# Construct plot of postlife (belief in life after death)
ggplot(gss2016, aes(x = postlife)) +
geom_bar()
# Compute and save the observed proportion that believe
p_hat <- gss2016 %>%
summarize(mean(postlife == "YES", na.rm = TRUE)) %>%
pull()
# Generate one data set under H0: p = 0.75 (point null, simulated draws)
sim1 <- gss2016 %>%
specify(response = postlife, success = "YES") %>%
hypothesize(null = "point", p = 0.75) %>%
generate(reps = 1, type = "simulate")
## Warning: Removed 279 rows containing missing values.
# Construct plot of the simulated data set
ggplot(sim1, aes(x=postlife)) +
geom_bar()
# Compute proportion that believe in the simulated data
# (should be close to the hypothesized 0.75)
sim1 %>%
summarize(mean(postlife == "YES")) %>%
pull()
## [1] 0.7472952
# Generate the null distribution: 100 simulated proportions under p = 0.75
null <- gss2016 %>%
specify(response = postlife, success = "YES") %>%
hypothesize(null = "point", p = .75) %>%
generate(reps = 100, type = "simulate") %>%
calculate(stat = "prop")
## Warning: Removed 279 rows containing missing values.
# Visualize null distribution with the observed p_hat marked in red
ggplot(null, aes(x = stat)) +
geom_density() +
geom_vline(xintercept = p_hat, color = "red")
# Compute the two-tailed p-value (doubling the one-tail proportion)
null %>%
summarize(mean(stat > p_hat)) %>%
pull() * 2
## [1] 0
# Plot distribution of capital punishment opinion by sex
ggplot(gss2016, aes(x = sex, fill = cappun)) +
geom_bar(position = "fill")
# Compute the two observed proportions favoring, one per sex
p_hats <- gss2016 %>%
group_by(sex) %>%
summarize(mean(cappun == "FAVOR", na.rm = TRUE)) %>%
pull()
# Compute the observed difference in proportions
d_hat <- diff(p_hats)
# Create the null distribution by permuting sex labels (independence null)
null <- gss2016 %>%
specify(cappun ~ sex, success = "FAVOR") %>%
hypothesize(null = "independence") %>%
generate(reps = 500, type = "permute") %>%
calculate(stat = "diff in props", order = c("FEMALE", "MALE"))
## Warning: Removed 172 rows containing missing values.
# Visualize the null distribution with the observed difference in red
ggplot(null, aes(x = stat)) +
geom_density() +
geom_vline(xintercept = d_hat, col = "red")
# Compute the two-tailed p-value (d_hat is negative, so the lower tail
# is doubled here)
null %>%
summarize(mean(stat < d_hat)) %>%
pull() * 2
## [1] 0
# Create the bootstrap distribution of the difference in proportions
boot <- gss2016 %>%
specify(cappun ~ sex, success="FAVOR") %>%
generate(reps=500, type="bootstrap") %>%
calculate(stat = "diff in props", order = c("FEMALE", "MALE"))
## Warning: Removed 172 rows containing missing values.
# Compute the standard error from the bootstrap spread
SE <- boot %>%
summarize(sd(stat)) %>%
pull()
# Form the approximate 95% CI (lower, upper)
c( d_hat - 2*SE, d_hat + 2*SE )
## [1] -0.12636862 -0.05205316
# Add a random "coinflip" group unrelated to any opinion — used below to
# illustrate what a true null effect looks like.
# NOTE(review): sample() is not seeded, so this split is not reproducible.
gssmod <- gss2016 %>%
mutate(coinflip=sample(c("heads", "tails"), size=nrow(.), replace=TRUE))
table(gssmod$coinflip)
##
## heads tails
## 1434 1433
# Find the difference in proportions favoring, by random coinflip group
p_hats <- gssmod %>%
group_by(coinflip) %>%
summarize(mean(cappun == "FAVOR", na.rm = TRUE)) %>%
pull()
# Compute difference in proportions (should be near zero by construction)
d_hat <- diff(p_hats)
# Form null distribution by permuting the coinflip labels
null <- gssmod %>%
specify(cappun ~ coinflip, success = "FAVOR") %>%
hypothesize(null = "independence") %>%
generate(reps = 500, type = "permute") %>%
calculate(stat = "diff in props", order = c("heads", "tails"))
## Warning: Removed 172 rows containing missing values.
# Visualize the null distribution with the observed difference in red
ggplot(null, aes(x = stat)) +
geom_density() +
geom_vline(xintercept = d_hat, color = "red")
# Set the significance level
alpha <- 0.05
# Find the rejection-region cutoffs: the alpha/2 and 1 - alpha/2 quantiles
# of the null distribution
upper <- null %>%
summarize(quantile(stat, probs = c(1-alpha/2))) %>%
pull()
lower <- null %>%
summarize(quantile(stat, probs = alpha/2)) %>%
pull()
# Visualize cutoffs (blue) around the observed statistic (red)
ggplot(null, aes(x = stat)) +
geom_density() +
geom_vline(xintercept = d_hat, color = "red") +
geom_vline(xintercept = lower, color = "blue") +
geom_vline(xintercept = upper, color = "blue")
# Check if the observed statistic falls inside the cutoffs (fail to reject H0)
d_hat %>%
between(lower, upper)
## [1] TRUE
Chapter 3 - Comparing Many Parameters (Independence)
Contingency tables:
Chi-squared test statistic:
Alternative method - chi-squared test statistic:
Intervals for chi-squared:
Example code includes:
# Collapse partyid to D/R/I/O, then exclude the "other" (O) party and drop
# its now-empty factor level
gss_party <- gss2016 %>%
mutate(party=fct_collapse(partyid,
"D"=c("STRONG DEMOCRAT", "NOT STR DEMOCRAT"),
"R"=c("NOT STR REPUBLICAN", "STRONG REPUBLICAN"),
"I"=c("IND,NEAR DEM", "INDEPENDENT", "IND,NEAR REP"),
"O"="OTHER PARTY"
)
) %>%
filter(!is.na(party), party != "O") %>%
droplevels()
# Bar plot of proportions (each bar normalized to 1)
gss_party %>%
ggplot(aes(x = party, fill = natspac)) +
geom_bar(position = "fill")
# Bar plot of raw counts
gss_party %>%
ggplot(aes(x=party, fill = natspac)) +
geom_bar()
# Create a contingency table of natspac and party
O <- gss_party %>%
select(natspac, party) %>%
table()
# Convert the table back to a tidy data frame (one row per observation,
# expanded from counts via uncount)
O %>%
broom::tidy() %>%
uncount(n)
## # A tibble: 1,249 x 2
## natspac party
## <chr> <chr>
## 1 TOO LITTLE D
## 2 TOO LITTLE D
## 3 TOO LITTLE D
## 4 TOO LITTLE D
## 5 TOO LITTLE D
## 6 TOO LITTLE D
## 7 TOO LITTLE D
## 8 TOO LITTLE D
## 9 TOO LITTLE D
## 10 TOO LITTLE D
## # ... with 1,239 more rows
# Create one permuted data set under the independence null
perm_1 <- gss_party %>%
specify(natarms ~ party) %>%
hypothesize(null = "independence") %>%
generate(reps = 1, type = "permute")
## Warning: Removed 1412 rows containing missing values.
# Visualize permuted data
ggplot(perm_1, aes(x = party, fill = natarms)) +
geom_bar()
# Make contingency table from the permuted data
tab <- perm_1 %>%
ungroup() %>%
select(natarms, party) %>%
table()
# Compute chi-squared stat.
# NOTE(review): despite the name, chi_obs_arms is computed from the single
# PERMUTED data set above, not from the observed data.
(chi_obs_arms <- chisq.test(tab)$statistic)
## X-squared
## 1.34665
# This one IS computed from the observed data (natspac vs party)
(chi_obs_spac <- chisq.test(gss_party$natspac, gss_party$party)$statistic)
## X-squared
## 7.568185
# Create null
null <- gss_party %>%
specify(natspac ~ party) %>%
hypothesize(null = "independence") %>%
generate(reps = 100, type = "permute") %>%
calculate(stat = "Chisq")
## Warning: Removed 1514 rows containing missing values.
# Visualize H_0 and obs
ggplot(null, aes(x = stat)) +
geom_density() +
geom_vline(xintercept = chi_obs_spac, color = "red")
# Create null
null <- gss_party %>%
specify(natarms ~ party) %>%
hypothesize(null = "independence") %>%
generate(reps = 100, type = "permute") %>%
calculate(stat = "Chisq")
## Warning: Removed 1412 rows containing missing values.
# Visualize H_0 and obs
ggplot(null, aes(x = stat)) +
geom_density() +
geom_vline(xintercept = chi_obs_arms, color = "red")
# create bar plot
gss2016 %>%
ggplot(aes(x = region, fill = happy)) +
geom_bar(position = "fill") +
coord_flip()
# create table
tab <- gss2016 %>%
select(happy, region) %>%
table()
# compute observed statistic
(chi_obs_stat <- chisq.test(tab)$statistic)
## X-squared
## 12.60899
# generate null distribution
null <- gss2016 %>%
mutate(happy=fct_other(happy, keep=c("VERY HAPPY"))) %>%
specify(happy ~ region, success = "VERY HAPPY") %>%
hypothesize(null = "independence") %>%
generate(reps = 500, type = "permute") %>%
calculate(stat = "Chisq")
## Warning: Removed 8 rows containing missing values.
# plot null(s)
ggplot(null, aes(x = stat)) +
geom_density() +
geom_vline(xintercept = chi_obs_stat) +
stat_function(fun = dchisq, args = list(df = (9-1)*(2-1)), color = "blue")
# permutation p-value
null %>%
summarize(mean(stat > chi_obs_stat)) %>%
pull()
## [1] 0.116
# approximation p-value
1 - pchisq(chi_obs_stat, df = (9-1)*(2-1))
## X-squared
## 0.1260301
Chapter 4 - Comparing Many Parameters (Goodness of Fit)
Case Study: Election Fraud:
Goodness of Fit:
Now applying the same first-digit (Benford's law) analysis to US election data, using Iowa county results:
Wrap-Up:
Example code includes:
# Load city-level results of the 2009 Iranian presidential election
iran <- readr::read_csv("./RInputFiles/iran.csv")
## Parsed with column specification:
## cols(
## province = col_character(),
## city = col_character(),
## ahmadinejad = col_integer(),
## rezai = col_integer(),
## karrubi = col_integer(),
## mousavi = col_integer(),
## total_votes_cast = col_integer(),
## voided_votes = col_integer(),
## legitimate_votes = col_integer()
## )
glimpse(iran)
## Observations: 366
## Variables: 9
## $ province <chr> "East Azerbaijan", "East Azerbaijan", "East A...
## $ city <chr> "Azar Shahr", "Asko", "Ahar", "Bostan Abad", ...
## $ ahmadinejad <int> 37203, 32510, 47938, 38610, 36395, 435728, 20...
## $ rezai <int> 453, 481, 568, 281, 485, 9830, 166, 55, 442, ...
## $ karrubi <int> 138, 468, 173, 53, 190, 3513, 74, 46, 211, 12...
## $ mousavi <int> 18312, 18799, 26220, 12603, 33695, 419983, 14...
## $ total_votes_cast <int> 56712, 52643, 75500, 51911, 71389, 876919, 35...
## $ voided_votes <int> 606, 385, 601, 364, 624, 7865, 195, 102, 634,...
## $ legitimate_votes <int> 56106, 52258, 74899, 51547, 70765, 869054, 35...
# Compute candidate totals
totals <- iran %>%
summarize(ahmadinejad = sum(ahmadinejad),
rezai = sum(rezai),
karrubi = sum(karrubi),
mousavi = sum(mousavi))
# Plot totals
# NOTE(review): gather() is superseded by pivot_longer() in tidyr >= 1.0
totals %>%
gather(key = "candidate", value = "votes") %>%
ggplot(aes(x = candidate, y = votes)) +
geom_bar(stat = "identity")
# Provinces won by runner-up Mousavi (the code groups by province,
# not city, despite what the original "# Cities won by #2" label said)
iran %>%
group_by(province) %>%
summarize(ahmadinejad = sum(ahmadinejad),
mousavi = sum(mousavi)) %>%
mutate(mousavi_win = mousavi > ahmadinejad) %>%
filter(mousavi_win)
## # A tibble: 2 x 4
## province ahmadinejad mousavi mousavi_win
## <chr> <int> <int> <lgl>
## 1 Sistan and Baluchestan 450269 507946 TRUE
## 2 West Azerbaijan 623946 656508 TRUE
# Print get_first
# Extract the leading digit of each number as a factor (e.g. 4731 -> "4").
# Used to compare observed first-digit frequencies against Benford's law.
get_first <- function(x) {
  leading_char <- substr(as.character(x), 1, 1)
  factor(as.numeric(leading_char))
}
# Create first_digit
# Leading digit of each city's total votes cast
iran2 <- iran %>%
mutate(first_digit = get_first(total_votes_cast))
# Construct barchart
iran2 %>%
ggplot(aes(x=first_digit)) +
geom_bar()
# Tabulate the counts of each digit
tab <- iran2 %>%
select(first_digit) %>%
table()
# Compute observed stat
# Benford's-law probability of a leading digit d in 1..9:
# P(d) = log10(1 + 1/d). Computed from the formula instead of the
# original nine hard-coded decimal constants (which were exactly
# these values printed to 15 digits).
p_benford <- log10(1 + 1 / (1:9))
names(p_benford) <- 1:9
# Rebalance the last entry so the vector sums to exactly 1, as
# required by chisq.test()'s validation of the `p` argument.
p_benford[9] <- 1 - sum(p_benford[-9])
sum(p_benford)
## [1] 1
# Observed chi-squared statistic of iran2's first digits vs Benford
chi_obs_stat <- chisq.test(tab, p = p_benford)$stat
# Form null distribution
# Simulate 500 samples of first digits drawn from p_benford
null <- iran2 %>%
specify(response=first_digit) %>%
hypothesize(null = "point", p = p_benford) %>%
generate(reps=500, type = "simulate") %>%
calculate(stat = "Chisq")
# plot both nulls
# Simulated null density plus the theoretical chi-squared density
# with df = 9 - 1 (nine possible leading digits)
ggplot(null, aes(x = stat)) +
geom_density() +
geom_vline(xintercept = chi_obs_stat) +
stat_function(fun = dchisq, args = list(df = 9-1), color = "blue")
# permutation p-value
# Proportion of simulated statistics exceeding the observed one
null %>%
summarize(mean(stat > chi_obs_stat)) %>%
pull()
## [1] 0.006
# approximation p-value
# Equivalent to 1 - pchisq(...); lower.tail = FALSE is the
# numerically preferable form for upper-tail probabilities
pchisq(chi_obs_stat, df=9-1, lower.tail=FALSE)
## X-squared
## 0.006836367
# Load county-level results of the 2016 Iowa general election
iowa <- readr::read_csv("./RInputFiles/iowa.csv")
## Parsed with column specification:
## cols(
## office = col_character(),
## candidate = col_character(),
## party = col_character(),
## county = col_character(),
## votes = col_integer()
## )
glimpse(iowa)
## Observations: 1,386
## Variables: 5
## $ office <chr> "President/Vice President", "President/Vice Presiden...
## $ candidate <chr> "Evan McMullin / Nathan Johnson", "Under Votes", "Ga...
## $ party <chr> "Nominated by Petition", NA, "Libertarian", NA, "Soc...
## $ county <chr> "Adair", "Adair", "Adair", "Adair", "Adair", "Adair"...
## $ votes <int> 10, 32, 127, 5, 0, 10, 1133, 14, 3, 2461, 3848, 38, ...
# Get R+D county totals
# Combined Clinton + Trump vote total for each county
iowa2 <- iowa %>%
filter(candidate == "Hillary Clinton / Tim Kaine" | candidate == "Donald Trump / Mike Pence") %>%
group_by(county) %>%
summarize(dem_rep_votes = sum(votes, na.rm = TRUE))
# Add first_digit
iowa3 <- iowa2 %>%
mutate(first_digit = get_first(dem_rep_votes))
# Construct bar plot
iowa3 %>%
ggplot(aes(x=first_digit)) +
geom_bar()
# Tabulate the counts of each digit
tab <- iowa3 %>%
select(first_digit) %>%
table()
# Compute observed stat
# The warning below is triggered because some expected first-digit
# counts are small -- presumably due to the small number of county
# rows in iowa3; verify against the data
chi_obs_stat <- chisq.test(tab, p = p_benford)$stat
## Warning in chisq.test(tab, p = p_benford): Chi-squared approximation may be
## incorrect
# Form null distribution
# Simulate 500 samples of first digits drawn from p_benford; each
# simulated sample's chi-squared statistic forms the null distribution
null <- iowa3 %>%
specify(response = first_digit) %>%
hypothesize(null = "point", p = p_benford) %>%
generate(reps = 500, type = "simulate") %>%
calculate(stat = "Chisq")
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
## Warning in stats::chisq.test(table(first_digit), p = attr(x, "params")):
## Chi-squared approximation may be incorrect
# Visualize the null distribution of the test statistic; the vertical line
# marks the observed chi-squared statistic for comparison against the null.
# NOTE(review): `null` and `chi_obs_stat` are assumed to come from an earlier
# simulation pipeline (source of the chisq.test warnings above) — confirm
# against the preceding chunk.
ggplot(null, aes(x = stat)) +
geom_density() +
geom_vline(xintercept = chi_obs_stat)
Chapter 1 - Dashboard Layouts
Introduction:
Anatomy of flexdashboard:
Layout basics:
Advanced layouts:
Example code includes (not added due to need for separate dashboard file):
Chapter 2 - Data Visualization for Dashboards
Graphs:
Web-Friendly Visualizations:
htmlwidgets:
Example code includes (not added due to need for separate dashboard file):
Chapter 3 - Dashboard Components
Highlighting Single Values:
Dashboard Tables:
Text for Dashboards:
Example code includes (not added due to need for separate dashboard file):
Chapter 4 - Adding Interactivity with Shiny
Incorporating Shiny into Dashboards:
Reactive Dataframe Pattern:
Customized Inputs for Charts:
Wrap-up:
Example code includes (not added due to need for separate dashboard file):
Chapter 1 - Exploring Graphs Through Time
Exploring Data Set:
Exploring Temporal Structure:
Example code includes:
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
amzn_g <- read.graph("./RInputFiles/amzn_g.gml", format=c("gml"))
amzn_g
## IGRAPH ebb35d8 DN-- 10245 10754 --
## + attr: id (v/n), name (v/c)
## + edges from ebb35d8 (vertex names):
## [1] 44 ->42 179 ->71 410 ->730 415 ->741 656 ->1267 669 ->672
## [7] 672 ->669 689 ->690 689 ->1284 690 ->689 690 ->1284 730 ->410
## [13] 741 ->909 786 ->1767 802 ->806 806 ->802 856 ->205 857 ->211
## [19] 867 ->866 868 ->866 909 ->741 911 ->748 921 ->190 1015->151
## [25] 1016->1015 1047->1049 1049->1047 1204->1491 1267->656 1272->669
## [31] 1278->152 1282->943 1284->689 1285->1286 1286->1285 1290->1293
## [37] 1293->1290 1293->1606 1294->1295 1295->1294 1312->730 1350->2783
## [43] 1362->156 1366->190 1438->1580 1438->1581 1467->3996 1479->158
## + ... omitted several edges
# Dyad census: counts of mutual, asymmetric, and null (unconnected) dyads
dc <- dyad_census(amzn_g)
# Triad census: counts of each of the 16 possible directed triad configurations
tc <- triad_census(amzn_g)
# Edge density: ratio of actual edges to all possible edges
ed <- edge_density(amzn_g)
# Output values
print(dc)
## $mut
## [1] 3199
##
## $asym
## [1] 4356
##
## $null
## [1] 52467335
print(tc)
## [1] 179089386743 44610360 32763436 215 1906
## [6] 507 1198 457 118 0
## [11] 301 170 119 33 239
## [16] 288
print(ed)
## [1] 0.0001024681
# Calculate transitivity
transitivity(amzn_g)
## [1] 0.3875752
# Calculate reciprocity: the proportion of mutual connections in the directed graph
amzn_rp <- reciprocity(amzn_g)
# Build a null model: 1000 random G(n, p) graphs matching the observed
# graph's order and density
nv <- gorder(amzn_g)
ed <- edge_density(amzn_g)
rep_sim <- rep(NA, 1000)
# Simulate
# NOTE(review): erdos.renyi.game() is superseded by sample_gnp() in newer
# igraph releases — consider updating when the package is upgraded.
for(i in 1:1000){
rep_sim[i] <- reciprocity(erdos.renyi.game(nv, ed, "gnp", directed = TRUE))
}
# Compare the observed reciprocity against quantiles of the null distribution
quantile(rep_sim, c(0.25, .5, 0.975))
## 25% 50% 97.5%
## 0.0000000000 0.0000000000 0.0005504297
print(amzn_rp)
## [1] 0.5949414
# Get the distribution of in and out degrees
table(degree(amzn_g, mode = "in"))
##
## 0 1 2 3 4 5 6 7 8 9 11 12 17
## 2798 5240 1549 424 139 50 20 7 9 5 1 2 1
table(degree(amzn_g, mode = "out"))
##
## 0 1 2 3 4 5
## 1899 6350 1635 313 45 3
# Find important products based on the ratio of out to in and look for extremes
imp_prod <- V(amzn_g)[degree(amzn_g, mode = "out") > 3 & degree(amzn_g, mode = "in") < 3]
## Output the vertices
print(imp_prod)
## + 8/10245 vertices, named, from ebb35d8:
## [1] 1629 4545 6334 20181 62482 64344 155513 221085
ipFrom <- c(1629, 1629, 1629, 1629, 1629, 1629, 1629, 1629, 1629, 1629, 1629, 11163, 11163, 11163, 11163, 11163, 11163, 11163, 11163, 11163, 11163, 11163, 32129, 32129, 32129, 32129, 32129, 32129, 32129, 38131, 38131, 38131, 38131, 38131, 38131, 45282, 45282, 45282, 45282, 52831, 52831, 52831, 52831, 52831, 52831, 52831, 52831, 53591, 53591, 53591, 53591, 53591, 53591, 53591, 53591, 56427, 56427, 56427, 56427, 59706, 59706, 59706, 59706, 59706, 59706, 59706, 59706, 62482, 62482, 62482, 62482, 62482, 62482, 67038, 67038, 67038, 67038, 71192, 71192, 71192, 71192, 71192, 77957, 77957, 77957, 77957, 77957, 77957, 103733, 103733, 103733, 103733, 103733, 117841, 117841, 117841, 117841, 117841, 117841, 117841, 117841, 117841, 117841, 123808, 123808, 123808, 123808, 123808, 123808, 123808, 123808, 123808, 132757, 132757, 132757, 132757, 132757, 132757, 132757, 132757, 132757, 132757, 132757, 132757, 132757, 132757, 144749, 144749, 144749, 144749, 144749, 144749, 144749, 170830, 170830, 170830, 170830, 170830, 170830, 177282, 177282, 177282, 177282, 177282, 177282, 177432, 177432, 177432, 177432, 177432, 177432, 177432, 184526, 184526, 184526, 184526, 184526, 191825, 191825, 191825, 191825, 191825, 215668, 215668, 215668, 221085, 221085, 221085, 221085, 221085, 231604, 231604, 231604, 231604, 231604, 231604, 239014, 239014, 239014, 239014, 239014, 242693, 242693, 242693, 242693, 242693, 257621, 257621, 257621, 257621, 261587, 261587, 261587, 261587, 261587, 261587, 261657, 261657, 261657, 261657, 261657, 261657)
ipTo <- c(190, 1366, 2679, 4023, 1625, 1627, 7529, 1272, 1628, 1630, 1631, 11124, 15360, 20175, 10626, 20970, 10776, 11164, 11166, 5955, 8719, 11164, 23842, 23843, 24115, 15312, 23329, 32127, 80473, 44848, 44849, 44850, 38133, 31084, 33711, 10920, 20178, 20179, 87093, 2134, 2136, 4119, 9995, 36524, 64698, 64700, 52833, 120083, 120085, 120086, 36689, 12340, 113789, 32094, 51015, 1898, 10076, 15800, 61488, 63836, 63837, 63838, 8882, 59708, 59711, 26982, 59708, 69497, 69498, 69499, 69500, 23349, 62480, 58926, 58928, 64118, 52271, 71190, 71380, 75384, 9762, 57876, 43543, 43546, 98488, 77951, 77953, 116842, 103732, 103734, 103735, 103728, 124733, 117842, 117843, 117845, 117842, 117843, 117845, 117842, 117842, 117843, 117845, 59267, 89503, 89506, 156, 190, 105428, 184973, 195785, 195787, 132753, 132754, 132755, 52563, 132755, 132756, 132759, 132762, 126757, 132754, 132755, 132756, 189269, 265886, 43155, 80519, 159667, 82479, 152760, 136747, 65216, 114684, 114686, 114687, 117132, 132667, 81755, 109198, 109199, 109202, 144124, 75023, 216449, 139527, 149146, 152038, 177428, 177430, 177428, 177430, 56930, 61658, 207112, 250755, 250756, 56930, 141148, 191036, 147084, 245110, 175959, 177376, 177377, 88463, 103641, 115111, 165118, 228427, 43553, 76706, 78278, 131353, 75725, 119146, 12615, 15740, 229533, 151325, 237568, 239545, 239546, 239547, 110872, 215593, 60310, 60312, 133398, 44502, 261582, 261590, 261599, 271593, 261584, 261588, 261649, 261653, 261654, 261658, 261662, 105814)
ipGroupFrom <- factor(c('DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'Video', 'Video', 'Video', 'Video', 'Video', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'Video', 'Video', 'Video', 'Video', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD', 'DVD'), levels=c("DVD", "Video"))
ipSRFrom <- c(30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 37, 37, 37, 37, 37, 37, 26, 26, 26, 26, 14, 14, 14, 14, 14, 14, 14, 14, 16, 16, 16, 16, 16, 16, 16, 16, 10, 10, 10, 10, 1, 1, 1, 1, 1, 1, 1, 1, 19, 19, 19, 19, 19, 19, 10, 10, 10, 10, 5, 5, 5, 5, 5, 3, 3, 3, 3, 3, 3, 17, 17, 17, 17, 17, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 27, 27, 27, 27, 27, 27, 27, 10, 10, 10, 10, 10, 10, 6, 6, 6, 6, 6, 6, 19, 19, 19, 19, 19, 19, 19, 25, 25, 25, 25, 25, 3, 3, 3, 3, 3, 8, 8, 8, 27, 27, 27, 27, 27, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 26, 26, 26, 26, 26, 15, 15, 15, 15, 8, 8, 8, 8, 8, 8, 26, 26, 26, 26, 26, 26)
ipSRTo <- c(5, 2, 18, 20, 12, 6, 8, 14, 16, 4, 18, 20, 3, 6, 14, 5, 3, 3, 4, 3, 13, 3, 5, 9, 18, 17, 8, 2, 8, 9, 16, 9, 24, 11, 25, 6, 9, 3, 21, 1, 5, 2, 24, 2, 6, 6, 8, 18, 7, 4, 20, 6, 22, 13, 10, 19, 4, 22, 7, 7, 9, 7, 11, 21, 12, 17, 21, 5, 7, 2, 1, 26, 6, 14, 2, 17, 4, 13, 12, 6, 8, 13, 4, 7, 1, 7, 9, 15, 19, 6, 20, 0, 19, 14, 18, 11, 14, 18, 11, 14, 14, 18, 11, 16, 1, 5, 3, 5, 6, 22, 5, 20, 10, 29, 9, 22, 9, 12, 10, 9, 12, 29, 9, 12, 13, 6, 23, 6, 18, 10, 18, 6, 9, 11, 8, 8, 19, 12, 10, 9, 8, 14, 1, 7, 10, 13, 18, 6, 6, 4, 6, 4, 4, 22, 5, 8, 4, 4, 13, 11, 3, 4, 21, 22, 8, 18, 1, 6, 5, 5, 4, 8, 6, 12, 6, 3, 13, 8, 10, 1, 1, 22, 12, 18, 19, 5, 18, 31, 8, 13, 10, 14, 25, 4, 19, 17, 5, 21, 3, 1, 19, 10)
ipTRFrom <- c(290, 290, 290, 290, 290, 290, 290, 290, 290, 290, 290, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 73, 73, 73, 73, 73, 73, 73, 294, 294, 294, 294, 294, 294, 43, 43, 43, 43, 5, 5, 5, 5, 5, 5, 5, 5, 13, 13, 13, 13, 13, 13, 13, 13, 28, 28, 28, 28, 1, 1, 1, 1, 1, 1, 1, 1, 110, 110, 110, 110, 110, 110, 7, 7, 7, 7, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 25, 25, 25, 25, 25, 25, 25, 2, 2, 2, 2, 2, 2, 12, 12, 12, 12, 12, 12, 111, 111, 111, 111, 111, 111, 111, 294, 294, 294, 294, 294, 0, 0, 0, 0, 0, 0, 0, 0, 243, 243, 243, 243, 243, 43, 43, 43, 43, 43, 43, 15, 15, 15, 15, 15, 483, 483, 483, 483, 483, 1, 1, 1, 1, 12, 12, 12, 12, 12, 12, 2, 2, 2, 2, 2, 2)
ipTRTo <- c(19, 2, 22, 105, 22, 1, 6, 55, 40, 21, 47, 13, 0, 42, 14, 51, 2, 4, 0, 2, 41, 4, 0, 19, 21, 63, 5, 0, 2, 4, 63, 63, 7, 1, 8, 11, 134, 134, 12, 5, 10, 3, 58, 1, 6, 2, 27, 39, 2, 18, 87, 12, 218, 2, 30, 17, 0, 41, 13, 9, 3, 2, 13, 8, 10, 1, 8, 1, 0, 7, 1, 167, 63, 28, 0, 6, 1, 10, 4, 0, 2, 0, 5, 2, 3, 2, 2, 12, 24, 45, 21, 0, 8, 2, 21, 20, 2, 21, 20, 2, 2, 21, 20, 14, 6, 6, 3, 19, 13, 88, 4, 9, 6, 0, 19, 54, 19, 6, 9, 1, 2, 0, 19, 6, 3, 13, 46, 29, 6, 1, 15, 1, 4, 18, 28, 5, 15, 21, 10, 12, 3, 5, 4, 3, 8, 5, 0, 0, 5, 0, 5, 0, 1, 221, 1, 13, 3, 1, 7, 40, 5, 0, 8, 37, 67, 48, 0, 6, 1, 25, 1, 69, 0, 55, 3, 0, 5, 5, 2, 13, 0, 44, 53, 9, 4, 5, 13, 212, 3, 3, 1, 3, 8, 0, 3, 12, 11, 10, 5, 0, 49, 42)
ipTitleFrom <- c(16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 13, 13, 13, 13, 13, 13, 13, 30, 30, 30, 30, 30, 30, 11, 11, 11, 11, 26, 26, 26, 26, 26, 26, 26, 26, 5, 5, 5, 5, 5, 5, 5, 5, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 18, 18, 18, 18, 18, 18, 25, 25, 25, 25, 14, 14, 14, 14, 14, 12, 12, 12, 12, 12, 12, 21, 21, 21, 21, 21, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 27, 27, 27, 27, 27, 27, 27, 27, 27, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 29, 29, 29, 29, 29, 29, 29, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 15, 15, 15, 15, 15, 15, 15, 30, 30, 30, 30, 30, 7, 7, 7, 7, 7, 6, 6, 6, 9, 9, 9, 9, 9, 17, 17, 17, 17, 17, 17, 10, 10, 10, 10, 10, 24, 24, 24, 24, 24, 19, 19, 19, 19, 3, 3, 3, 3, 3, 3, 28, 28, 28, 28, 28, 28)
ipNames <- c('Attraction', 'Barbara The Fair With The Silken Hair', 'Cannibal Apocalypse', "DJ Qbert's Wave Twisters", 'David and Lisa', 'Def Comedy Jam Vol. 13', 'Detroit Lions 2001 NFL Team Video', 'Donnie McClurkin: Live in London and More', 'El Hombre Sin Sombra (Hollow Man)', 'Gladiator', 'Kindergarten Cop', "Kingsley's Meadow - Wise Guy", "Lady & The Tramp II - Scamp's Adventure", 'Lojong - Transforming the Mind (Boxed Set)', 'Menace II Society', 'Merlin', 'Modern Times', 'Murder by Numbers (Full Screen Edition)', 'Nancy Drew: A Haunting We Will Go', 'Princess Nine - Triple Play (Vol. 3)', 'Secret Agent AKA Danger Man Set 2', 'Seguire Tus Pasos', 'Selena Remembered', 'Seven (New Line Platinum Series)', 'Sheba Baby', 'Slaughter', 'The Complete Guide to Medicine Ball Training', 'The Gambler', 'The Getaway', 'The Sum of All Fears')
ip_df <- data.frame(X=1:202,
from=ipFrom,
to=ipTo,
salesrank.from=ipSRFrom,
salesrank.to=ipSRTo,
totalreviews.from=ipTRFrom,
totalreviews.to=ipTRTo,
group.from=ipGroupFrom,
title.from=factor(ipNames[ipTitleFrom], levels=ipNames)
)
# Create a new graph from the from/to edge list (edge order follows row order)
ip_g <- graph_from_data_frame(ip_df %>% select(from, to), directed = TRUE)
# Colour each edge by sales-rank direction: blue when the source's sales-rank
# number is <= the target's (i.e. from a higher- to a lower-ranked product),
# red otherwise (lower- to higher-ranked)
E(ip_g)$rank_flag <- ifelse(ip_df$salesrank.from <= ip_df$salesrank.to, "blue", "red")
# Plot and add a legend
plot(ip_g, vertex.label = NA, edge.arrow.width = 1, edge.arrow.size = 0,
edge.width = 4, margin = 0, vertex.size = 4,
edge.color = E(ip_g)$rank_flag, vertex.color = "black" )
# Use fixed fill colours matching the labels: unique(E(ip_g)$rank_flag) depends
# on which colour happens to appear first in the data, so the legend could
# silently mislabel the colours if the data changed
legend("bottomleft", legend = c("Lower to Higher Rank", "Higher to Lower Rank"),
fill = c("red", "blue"), cex = .7)
# Get a count of out degrees for all vertices
# deg_ct <- lapply(time_graph, function(x){return(degree(x, mode = "out") )})
# Create a dataframe starting by adding the degree count
# deg_df <- data.frame(ct = unlist(deg_ct))
# Add a column with the vertex names
# deg_df$vertex_name <- names(unlist(deg_ct))
# Add a time stamp
# deg_df$date <- ymd(rep(d, unlist(lapply(time_graph, function(x){length(V(x))}))))
# See all the vertices that have more than three out degrees
# lapply(time_graph, function(x){return(V(x)[degree(x, mode = "out") > 3])})
# Create a dataframe to plot of three important vertices
# vert_df <- deg_df %>% filter(vertex_name %in% c(1629, 132757, 117841))
# Draw the plot to see how they change through time
# ggplot(vert_df, aes(x = date, y = ct, group = vertex_name, colour = vertex_name)) + geom_path()
# Calculate clustering and reciprocity metrics
# trans <- unlist(lapply(all_graphs, FUN=transitivity))
# rp <- unlist(lapply(all_graphs, FUN=reciprocity))
# Create dataframe for plotting
# met_df <- data.frame("metric" = c(trans, rp))
# Repeat the data
# met_df$date <- rep(ymd(d), 2)
# Sort and then Repeat the metric labels
# met_df$name <- sort(rep(c("clustering", "reciprocity"), 4))
# Plot
# ggplot(met_df, aes(x= date, y= metric, group = name, colour = name)) + geom_path()
Chapter 2 - Talk About R on Twitter
Creating retweet graphs:
if(!rt_name %in% all_sn){ rt_g <- rt_g + vertices(rt_name) }
Building mentions graphs:
for(j in ment_name) { if(!j %in% all_sn) { ment_g <- ment_g + vertices(j) } ment_g <- ment_g + edges(c(raw_tweets$screen_name[i], j)) }
Finding communities:
Example code includes:
rt_g <- read.graph("./RInputFiles/rt_g.gml", format=c("gml"))
rt_g
## IGRAPH 096a106 DN-- 4118 6052 --
## + attr: id (v/n), name (v/c)
## + edges from 096a106 (vertex names):
## [1] thinkR_fr ->thw_ch thinkR_fr ->omarwagih
## [3] KJMillidine ->Rbloggers earino ->d4tagirl
## [5] ReecheshJC ->KirkDBorne SCMansbridge ->rstudiotips
## [7] DeepSingularity->gp_pulipaka chrisderv ->thinkR_fr
## [9] chrisderv ->_ColinFay chrisderv ->joshua_ulrich
## [11] mtrost2 ->romain_francois mtrost2 ->rstudiotips
## [13] mtrost2 ->RLangTip dani_sola ->rstudiotips
## [15] hrhotz ->rstudiotips hrhotz ->cboettig
## + ... omitted several edges
# Calculate the number of edges (gsize() counts edges, not nodes — the
# original comment had nodes/edges swapped; the value 6052 matches the
# edge count in the graph summary above)
gsize(rt_g)
## [1] 6052
# Calculate the number of vertices (gorder() counts nodes; 4118 matches the
# vertex count in the graph summary above)
gorder(rt_g)
## [1] 4118
# Calculate the density
# NOTE(review): graph.density() is a legacy alias of edge_density()
graph.density(rt_g)
## [1] 0.00035697
# Create the plot
plot(rt_g, vertex.label = NA, edge.arrow.width = .8, edge.arrow.size = 0.4, vertex.size = 3)
# Set the default color to black
V(rt_g)$color <- "black"
# Set the color of nodes that were retweeted just once to blue
V(rt_g)[degree(rt_g, mode = "in") == 1]$color <- "blue"
# Set the color of nodes that were retweeters just once to green
V(rt_g)[degree(rt_g, mode = "out") == 1 ]$color <- "green"
# Plot the network
plot(rt_g, vertex.label = NA, edge.arrow.width = .8,
edge.arrow.size = 0.25,
vertex.size = 4, vertex.color = V(rt_g)$color)
# Set the default color to black
V(rt_g)$color <- "black"
# Set the color of nodes that were retweeted just once to blue
V(rt_g)[degree(rt_g, mode = "in") == 1 & degree(rt_g, mode = "out") == 0]$color <- "blue"
# Set the color of nodes that were retweeters just once to green
V(rt_g)[degree(rt_g, mode = "in") == 0 & degree(rt_g, mode = "out") == 1 ]$color <- "green"
# Plot the network
plot(rt_g, vertex.label = NA, edge.arrow.width = .8,
edge.arrow.size = 0.25,
vertex.size = 4, vertex.color = V(rt_g)$color)
# Betweenness centrality: how often each vertex sits on shortest paths
rt_btw <- igraph::betweenness(rt_g, directed = TRUE)
# Plot histogram (xlim trims the long right tail)
hist(rt_btw, breaks = 2000, xlim = c(0, 1000), main = "Betweenness")
# Eigenvector centrality: influence weighted by neighbours' influence
rt_ec <- eigen_centrality(rt_g, directed = TRUE)
# Plot histogram
hist(rt_ec$vector, breaks = 100, xlim = c(0, .2), main = "Eigen Centrality")
# Get top 1% of vertices by eigen centrality
top_ec <- rt_ec$vector[rt_ec$vector > quantile(rt_ec$vector, .99)]
# Get top 1% of vertices by betweenness
top_btw <- rt_btw[rt_btw > quantile(rt_btw, .99)]
# Make a nice data frame to print, with three columns, Rank, Betweenness, and Eigencentrality.
# seq_along() is safe for empty input (1:length() would yield c(1, 0)), and
# TRUE is spelled out instead of the reassignable shorthand T.
# NOTE(review): cbind() coerces everything to character (Rank included) and
# silently recycles if top_btw and top_ec differ in length — confirm the two
# top-1% vectors are the same size.
most_central <- as.data.frame(cbind(seq_along(top_ec), names(sort(top_btw, decreasing = TRUE)),
names(sort(top_ec, decreasing = TRUE))
)
)
# Set column names
colnames(most_central) <- c("Rank", "Betweenness", "Eigen Centrality")
# Print out the data frame
print(most_central)
## Rank Betweenness Eigen Centrality
## 1 1 hadleywickham ma_salmon
## 2 2 kierisi rstudiotips
## 3 3 drob opencpu
## 4 4 opencpu AchimZeileis
## 5 5 ma_salmon dataandme
## 6 6 rmflight drob
## 7 7 dataandme _ColinFay
## 8 8 _ColinFay rOpenSci
## 9 9 juliasilge kearneymw
## 10 10 revodavid RobertMylesMc
## 11 11 rOpenSci ptrckprry
## 12 12 nj_tierney rmflight
## 13 13 jonmcalder thosjleeper
## 14 14 Md_Harris revodavid
## 15 15 mauro_lepore juliasilge
## 16 16 sckottie RLadiesGlobal
## 17 17 RLadiesGlobal hadleywickham
## 18 18 kearneymw mauro_lepore
## 19 19 lenkiefer JennyBryan
## 20 20 NumFOCUS tudosgar
## 21 21 tjmahr cboettig
## 22 22 TheRealEveret antuki13
## 23 23 RLadiesMAD jasdumas
## 24 24 jasdumas Rbloggers
## 25 25 JennyBryan rensa_co
## 26 26 hrbrmstr timtrice
## 27 27 antuki13 daattali
## 28 28 Voovarb johnlray
## 29 29 timtrice joranelias
## 30 30 thinkR_fr StatsbyLopez
## 31 31 benmarwick kierisi
## 32 32 RosanaFerrero joshua_ulrich
## 33 33 clquezadar thinkR_fr
## 34 34 drsimonj ledell
## 35 35 zentree pssGuy
## 36 36 thomasp85 bastistician
## 37 37 OilGains zentree
## 38 38 yodacomplex brookLYNevery1
## 39 39 annakrystalli Md_Harris
## 40 40 davidhughjones sckottie
## 41 41 noamross jonmcalder
## 42 42 AlexaLFH nj_tierney
# Transform rt_btw and add as centrality
V(rt_g)$cent <- log(rt_btw+2)
# Visualize
plot(rt_g, vertex.label = NA, edge.arrow.width = .2,
edge.arrow.size = 0.0,
vertex.size = unlist(V(rt_g)$cent), vertex.color = "red")
# Create subgraph
rt_sub <-induced_subgraph(rt_g, V(rt_g)[V(rt_g)$cent >= quantile(V(rt_g)$cent, 0.99 )])
# Plot subgraph
plot(rt_sub, vertex.label = NA, edge.arrow.width = .2,
edge.arrow.size = 0.0,
vertex.size = unlist(V(rt_sub)$cent), vertex.color = "red")
ment_g <- read.graph("./RInputFiles/ment_g.gml", format=c("gml"))
ment_g
## IGRAPH 1c93213 DN-- 955 975 --
## + attr: id (v/n), name (v/c)
## + edges from 1c93213 (vertex names):
## [1] thinkR_fr ->ma_salmon thinkR_fr ->rstudio
## [3] thinkR_fr ->rforjournalists thinkR_fr ->aschinchon
## [5] thinkR_fr ->zedsamurai thinkR_fr ->ikashnitsky
## [7] thinkR_fr ->NSSDeviations thinkR_fr ->BeginTry
## [9] chrisderv ->pbaumgartner njogukennly->rstudio
## [11] ma_salmon ->rOpenSci ma_salmon ->RLadiesDC
## [13] ma_salmon ->marvin_dpr ma_salmon ->drob
## [15] ma_salmon ->kearneymw ma_salmon ->LucyStats
## + ... omitted several edges
# In/out degree ratio per vertex: Inf where out-degree is 0, NaN where both are 0
rt_ratio <- degree(rt_g, mode="in") / (degree(rt_g, mode="out"))
ment_ratio <- degree(ment_g, mode="in") / (degree(ment_g, mode="out"))
# Create a dataframe to plot with ggplot (mention ratios stacked above retweet
# ratios, labelled in the same order)
ratio_df <- data.frame(io_ratio = c(ment_ratio, rt_ratio))
ratio_df["graph_type"] <- c(rep("Mention", length(ment_ratio)), rep("Retweet", length(rt_ratio)) )
# NOTE(review): ratio_df_filtered is computed but never used — the plot below
# uses the unfiltered ratio_df (hence the "Removed ... rows" warning that
# follows). Confirm whether the filtered frame was meant to be plotted.
ratio_df_filtered <- ratio_df %>% filter(!is.infinite(io_ratio) & io_ratio > 0)
# Plot the graph
ggplot(ratio_df, aes(x = io_ratio , fill= graph_type, group = graph_type)) +
geom_density(alpha = .5) +
xlim(0, 10)
## Warning: Removed 891 rows containing non-finite values (stat_density).
# Check the mean and median of each ratio
ratio_df %>% group_by(graph_type) %>% summarise(m_ratio = mean(io_ratio))
## # A tibble: 2 x 2
## graph_type m_ratio
## <chr> <dbl>
## 1 Mention Inf
## 2 Retweet Inf
ratio_df %>% group_by(graph_type) %>% summarise(med = median(io_ratio))
## # A tibble: 2 x 2
## graph_type med
## <chr> <dbl>
## 1 Mention Inf
## 2 Retweet 0
ratio_df %>% filter(io_ratio != +Inf) %>% group_by(graph_type) %>% summarise(m_ratio = mean(io_ratio))
## # A tibble: 2 x 2
## graph_type m_ratio
## <chr> <dbl>
## 1 Mention 0.294
## 2 Retweet 0.268
ratio_df %>% filter(io_ratio != +Inf) %>% group_by(graph_type) %>% summarise(med = median(io_ratio))
## # A tibble: 2 x 2
## graph_type med
## <chr> <dbl>
## 1 Mention 0
## 2 Retweet 0
# Plot mention graph
plot(ment_g, vertex.label = NA, edge.arrow.width = .8,
edge.arrow.size = 0.2,
margin = 0,
vertex.size = 3)
# Find the assortivity of each graph
assortativity_degree(rt_g, directed = TRUE)
## [1] -0.1502212
assortativity_degree(ment_g, directed = TRUE)
## [1] -0.07742748
# Find the reciprocity of each graph
reciprocity(rt_g)
## [1] 0.005948447
reciprocity(ment_g)
## [1] 0.01846154
# Get size 3 cliques
clq_list <- cliques(ment_g, min = 3, max = 3)
## Warning in cliques(ment_g, min = 3, max = 3): At igraph_cliquer.c:56 :Edge
## directions are ignored for clique calculations
# Convert the clique list to a 3-column dataframe (one row per clique) and
# keep only the cliques that include revodavid
clq_df <- data.frame(matrix(names(unlist(clq_list)), nrow = length(clq_list), byrow = TRUE))
rev_d <- clq_df %>% filter(X1 == "revodavid" | X2 == "revodavid" | X3 == "revodavid") %>% droplevels()
# Create empty graph and build it up, adding all three edges of each clique.
# seq_len(nrow(...)) is safe when rev_d has zero rows, unlike 1:dim(rev_d)[1],
# which would iterate over c(1, 0); byrow = TRUE avoids the reassignable T.
clq_g_empty <- graph.empty()
clq_g <- clq_g_empty + vertices(unique(unlist(rev_d)))
for(i in seq_len(nrow(rev_d))){
clq_g <- clq_g + edges(rev_d[i, 1], rev_d[i, 2])
clq_g <- clq_g + edges(rev_d[i, 2], rev_d[i, 3])
clq_g <- clq_g + edges(rev_d[i, 1], rev_d[i, 3])}
# Trim graph and plot using `simplify()`
clq_g_trimmed <- as.undirected(simplify(clq_g))
plot(clq_g_trimmed)
# Find the communities
rt_fgc <- cluster_fast_greedy(as.undirected(rt_g))
rt_info <- cluster_infomap(as.undirected(rt_g))
rt_clust <- cluster_louvain(as.undirected(rt_g))
# Compare all the communities
compare(rt_fgc, rt_clust, method = 'vi')
## [1] 2.144703
compare(rt_info, rt_clust, method = 'vi')
## [1] 1.623552
compare(rt_fgc, rt_info, method = 'vi')
## [1] 2.324274
# Test membership of the same users
fgc_test <- which(names(membership(rt_fgc)) %in% c("bass_analytics", "big_data_flow"))
membership(rt_fgc)[fgc_test]
## bass_analytics big_data_flow
## 3 3
info_test <- which(names(membership(rt_info)) %in% c("bass_analytics", "big_data_flow"))
membership(rt_info)[info_test]
## bass_analytics big_data_flow
## 102 77
# The crossing() function in igraph will return true if a particular edge crosses communities
# This is useful when we want to see certain vertices that are bridges between communities
# Assign cluster membership to each vertex in rt_g using membership()
V(rt_g)$clust <- membership(rt_clust)
# Assign crossing value to each edge
E(rt_g)$cross <- crossing(rt_clust, rt_g)
# Plot the whole graph (this is probably a mess)
plot(rt_g, vertex.label = NA, edge.arrow.width = .8, edge.arrow.size = 0.2,
coords = layout_with_fr(rt_g), margin = 0, vertex.size = 3,
vertex.color = V(rt_g)$clust, edge.color = E(rt_g)$cross+1)
# Create a subgraph with just a few communities greater than 50 but less than 90 in size
mid_comm <- as.numeric(names(sizes(rt_clust)[sizes(rt_clust) > 50 & sizes(rt_clust) < 90 ]))
rt_sg <- induced.subgraph(rt_g, V(rt_g)[ clust %in% mid_comm ])
# Plot the subgraph
plot(rt_sg, vertex.label = NA, edge.arrow.width = .8, edge.arrow.size = 0.2,
coords = layout_with_fr(rt_sg), margin = 0, vertex.size = 3,
vertex.color = V(rt_sg)$clust, edge.color = E(rt_sg)$cross+1)
Chapter 3 - Bike Sharing in Chicago
Creating our graph from raw data:
Compare Graph Distance vs. Geographic Distance:
Connectivity:
Example code includes:
bike_dat <- readr::read_csv("./RInputFiles/divvy_bike_sample.csv")
## Parsed with column specification:
## cols(
## tripduration = col_integer(),
## from_station_id = col_integer(),
## from_station_name = col_character(),
## to_station_id = col_integer(),
## to_station_name = col_character(),
## usertype = col_character(),
## gender = col_character(),
## birthyear = col_integer(),
## from_latitude = col_double(),
## from_longitude = col_double(),
## to_latitude = col_double(),
## to_longitude = col_double(),
## geo_distance = col_double()
## )
glimpse(bike_dat)
## Observations: 52,800
## Variables: 13
## $ tripduration <int> 295, 533, 1570, 2064, 2257, 296, 412, 948, 8...
## $ from_station_id <int> 49, 165, 25, 300, 85, 174, 75, 45, 85, 99, 3...
## $ from_station_name <chr> "Dearborn St & Monroe St", "Clark St & Wavel...
## $ to_station_id <int> 174, 308, 287, 296, 313, 198, 56, 147, 174, ...
## $ to_station_name <chr> "Canal St & Madison St", "Seeley Ave & Rosco...
## $ usertype <chr> "Subscriber", "Subscriber", "Customer", "Cus...
## $ gender <chr> "Male", "Male", NA, NA, "Male", "Female", "M...
## $ birthyear <int> 1964, 1972, NA, NA, 1963, 1973, 1989, 1965, ...
## $ from_latitude <dbl> 41.88132, 41.95078, 41.89766, 41.93773, 41.9...
## $ from_longitude <dbl> -87.62952, -87.65917, -87.62351, -87.64409, ...
## $ to_latitude <dbl> 41.88209, 41.94340, 41.88032, 41.94011, 41.9...
## $ to_longitude <dbl> -87.63983, -87.67962, -87.63519, -87.64545, ...
## $ geo_distance <dbl> 858.9672, 1881.5034, 2159.4804, 287.8546, 30...
# Create trip_df_subs: subscriber trip counts per (from, to) station pair
trip_df_subs <- bike_dat %>%
filter(usertype == "Subscriber") %>%
group_by(from_station_id, to_station_id) %>%
summarise(weights = n())
# Create igraph object from the first two columns (the station-pair edge list)
trip_g_subs <- graph_from_data_frame(trip_df_subs[, 1:2])
# Add edge weights normalised to proportions of all subscriber trips.
# NOTE(review): this relies on graph_from_data_frame() preserving row order
# for its edges — confirm, since a mismatch would scramble the weights.
E(trip_g_subs)$weights <- trip_df_subs$weights / sum(trip_df_subs$weights)
# Now work the same code and filter it down to non-subs ("Customer")
trip_df_non_subs <- bike_dat %>%
filter(usertype == "Customer") %>%
group_by(from_station_id, to_station_id) %>%
summarise(weights = n())
# Create igraph object
trip_g_non_subs <- graph_from_data_frame(trip_df_non_subs[, 1:2])
# Add edge weights (same normalisation, over customer trips only)
E(trip_g_non_subs)$weights <- trip_df_non_subs$weights / sum(trip_df_non_subs$weights)
# Now let's compare these graphs
gsize(trip_g_subs)
## [1] 14679
gsize(trip_g_non_subs)
## [1] 9528
# Create the subgraphs
sg_sub <- induced_subgraph(trip_g_subs, 1:12)
sg_non_sub <- induced_subgraph(trip_g_non_subs, 1:12)
# Plot sg_sub
plot(sg_sub, vertex.size = 20, edge.arrow.width = .8, edge.arrow.size = 0.4,
margin = 0, edge.width = E(sg_sub)$weights*10000, main = "Subscribers")
# Plot sg_non_sub
plot(sg_non_sub, vertex.size = 20, edge.arrow.width = .8, edge.arrow.size = 0.4,
margin = 0, vertex.size = 10, edge.width = E(sg_non_sub)$weights*10000,
main = "Customers")
# Great-circle distance (in metres) between two stations, looked up by id.
# For each station, one departing trip record is sampled from the dataframe
# to obtain its (longitude, latitude); the distance uses the Haversine formula.
bike_dist <- function(station_1, station_2, divy_bike_df){
  station_coords <- function(station_id) {
    divy_bike_df %>%
      filter(from_station_id == station_id) %>%
      sample_n(1) %>%
      select(from_longitude, from_latitude)
  }
  geosphere::distm(station_coords(station_1), station_coords(station_2),
                   fun = geosphere::distHaversine)
}
# See the diameter (longest shortest path) of each graph
get_diameter(trip_g_subs)
## + 7/300 vertices, named, from f079f82:
## [1] 200 336 267 150 45 31 298
get_diameter(trip_g_non_subs)
## + 6/299 vertices, named, from f07dc8c:
## [1] 116 31 25 137 135 281
# Find the pair of vertices at either end of the diameter
farthest_vertices(trip_g_subs)
## $vertices
## + 2/300 vertices, named, from f079f82:
## [1] 200 298
##
## $distance
## [1] 6
farthest_vertices(trip_g_non_subs)
## $vertices
## + 2/299 vertices, named, from f07dc8c:
## [1] 116 281
##
## $distance
## [1] 5
# Geographic distance (meters) between each graph's farthest pair
bike_dist(200, 298, bike_dat)
## [,1]
## [1,] 17078.31
bike_dist(116, 281, bike_dat)
## [,1]
## [1,] 7465.656
# Build the full (all user types) trip-count data frame
trip_df <- bike_dat %>%
group_by(from_station_id, to_station_id) %>%
summarise(weights = n())
# Create igraph object from the (from, to) columns
trip_g_df <- graph_from_data_frame(trip_df[, 1:2])
# Add normalized edge weights (attribute named `weights`, not `weight`)
E(trip_g_df)$weights <- trip_df$weights / sum(trip_df$weights)
# With remove.multiple = FALSE, simplify() removes only self-loops;
# any parallel edges between the same station pair are kept.
trip_g_simp <- simplify(trip_g_df, remove.multiple=FALSE)
trip_g_simp
## IGRAPH f0beca5 DN-- 300 18773 --
## + attr: name (v/c), weights (e/n)
## + edges from f0beca5 (vertex names):
## [1] 5->14 5->16 5->25 5->29 5->33 5->35 5->36 5->37 5->43 5->49
## [11] 5->51 5->52 5->53 5->55 5->59 5->66 5->68 5->72 5->74 5->75
## [21] 5->76 5->81 5->85 5->90 5->92 5->97 5->98 5->99 5->100 5->108
## [31] 5->110 5->111 5->117 5->120 5->128 5->134 5->135 5->137 5->140 5->141
## [41] 5->144 5->146 5->148 5->149 5->168 5->169 5->171 5->174 5->175 5->176
## [51] 5->177 5->178 5->181 5->191 5->192 5->193 5->194 5->198 5->210 5->214
## [61] 5->218 5->227 5->233 5->237 5->255 5->264 5->268 5->273 5->277 5->291
## [71] 5->309 5->321 5->333 5->335 5->341
## + ... omitted several edges
# Find the in- and out-degree of every station (unweighted edge counts)
trip_out <- degree(trip_g_simp, mode = "out")
trip_in <- degree(trip_g_simp, mode = "in")
# Create a data frame for easier filtering.
# IDIOM FIX: data.frame(trip_out, trip_in) replaces
# data.frame(cbind(trip_out, trip_in)) -- the intermediate matrix was
# unnecessary; the result (columns, names, row names) is identical.
trip_deg <- data.frame(trip_out, trip_in)
trip_deg$station_id <- names(trip_out)
# Ratio > 1: more trips leave the station than arrive
trip_deg_adj <- trip_deg %>% mutate(ratio = trip_out / trip_in)
# Filter out rarely traveled to stations
trip_deg_filter <- trip_deg_adj %>% filter(trip_out > 10) %>% filter(trip_in > 10)
# Plot histogram
hist(trip_deg_filter$ratio)
# See which stations were the most skewed using which.min() and which.max()
trip_deg_filter %>% slice(which.min(ratio))
## trip_out trip_in station_id ratio
## 1 14 24 207 0.5833333
trip_deg_filter %>% slice(which.max(ratio))
## trip_out trip_in station_id ratio
## 1 19 11 135 1.727273
# If the weights are the same across all stations, then an unweighted degree ratio would work
# But if we want to know how many bikes are actually flowing, we need to consider weights
# The weighted analog to degree distribution is strength
# We can calculate this with the strength() function, which presents a weighted degree distribution based on the weight attribute of a graph's edges
# Calculate the weighted in and out degrees.
# NOTE(review): strength() auto-detects an edge attribute named `weight`;
# trip_g_simp stores its weights under `weights`, so these calls appear to
# fall back to plain degree (the outputs below match the unweighted block
# exactly). To get true strength, pass weights = E(trip_g_simp)$weights.
trip_out_w <- strength(trip_g_simp, mode = "out")
trip_in_w <- strength(trip_g_simp, mode = "in")
# Create a data frame for easier filtering.
# IDIOM FIX: direct data.frame() call replaces data.frame(cbind(...));
# the intermediate matrix was unnecessary and the result is identical.
trip_deg_w <- data.frame(trip_out_w, trip_in_w)
trip_deg_w$station_id <- names(trip_out_w)
trip_deg_w_adj <- trip_deg_w %>% mutate(ratio = trip_out_w / trip_in_w)
# Filter out rarely traveled to stations
trip_deg_w_filter <- trip_deg_w_adj %>% filter(trip_out_w > 10) %>% filter(trip_in_w > 10)
# Plot histogram of ratio
hist(trip_deg_w_filter$ratio)
# See which stations were the most skewed using which.min() and which.max()
trip_deg_w_filter %>% slice(which.min(ratio))
## trip_out_w trip_in_w station_id ratio
## 1 14 24 207 0.5833333
trip_deg_w_filter %>% slice(which.max(ratio))
## trip_out_w trip_in_w station_id ratio
## 1 19 11 135 1.727273
# Hand-entered coordinates for the 11 vertices of the station-275 ego graph,
# in the same order as the vertices, used as a fixed plot layout below.
latlong <- data.frame(from_longitude=c(-87.656495, -87.660996, -87.6554864, -87.642746, -87.67328, -87.661535, -87.623727, -87.668745, -87.65103, -87.666507, -87.666611),
from_latitude=c(41.858166, 41.869417, 41.8694821, 41.880422, 41.87501, 41.857556, 41.864059, 41.857901, 41.871737, 41.865234, 41.891072)
)
# Create a sub graph of the least traveled graph 275 (order-1 out-neighborhood)
g275 <- make_ego_graph(trip_g_simp, 1, nodes = "275", mode= "out")[[1]]
# Plot graph with geographic coordinates.
# BUG FIX: the edge attribute is named `weights` (set when the graph was
# built), not `weight`; E(g275)$weight was NULL, so edge.width silently
# fell back to the default.
plot(g275, layout = as.matrix(latlong), vertex.label.color = "blue", vertex.label.cex = .6,
edge.color = 'black', vertex.size = 15, edge.arrow.size = .1,
edge.width = E(g275)$weights, main = "Lat/Lon Layout")
# Plot graph without geographic coordinates
plot(g275, vertex.label.color = "blue", vertex.label.cex = .6,
edge.color = 'black', vertex.size = 15, edge.arrow.size = .1,
edge.width = E(g275)$weights,
main = "Default Layout")
# Eigen centrality "weighted" (weights = NULL lets igraph auto-detect).
# NOTE(review): auto-detection looks for an edge attribute named `weight`;
# this graph's attribute is `weights`, so the "weighted" calls below are
# likely computed unweighted -- which would explain why the weighted and
# unweighted results are identical. Confirm, or pass
# weights = E(trip_g_simp)$weights explicitly.
ec_weight <- eigen_centrality(trip_g_simp, directed = T, weights = NULL)
# Eigen centrality unweighted (weights = NA forces weights to be ignored)
ec_unweight <- eigen_centrality(trip_g_simp, directed = T, weights = NA)
# Closeness weighted (same auto-detect caveat as above)
close_weight <- closeness(trip_g_simp, weights = NULL)
# Closeness unweighted
close_unweight <- closeness(trip_g_simp, weights = NA)
# Show the least-central station under each measure, side by side
cbind(c(
names(V(trip_g_simp))[which.min(ec_weight$vector)],
names(V(trip_g_simp))[which.min(close_weight)],
names(V(trip_g_simp))[which.min(ec_unweight$vector)],
names(V(trip_g_simp))[which.min(close_unweight)]
), c("Weighted Eigen Centrality", "Weighted Closeness", "Unweighted Eigen Centrality", "Unweighted Closeness")
)
## [,1] [,2]
## [1,] "204" "Weighted Eigen Centrality"
## [2,] "336" "Weighted Closeness"
## [3,] "204" "Unweighted Eigen Centrality"
## [4,] "336" "Unweighted Closeness"
# Collapse to an undirected graph for the min-cut analysis below
trip_g_ud <- as.undirected(trip_g_simp)
trip_g_ud
## IGRAPH f0f6a40 UN-- 300 12972 --
## + attr: name (v/c)
## + edges from f0f6a40 (vertex names):
## [1] 5 --14 14--15 5 --16 13--16 15--16 16--17 5 --19 14--19 15--19 5 --20
## [11] 13--20 16--20 17--20 14--21 16--21 17--21 19--21 14--22 15--22 19--22
## [21] 21--22 5 --23 17--23 20--23 5 --24 16--24 17--24 23--24 5 --25 13--25
## [31] 16--25 17--25 20--25 21--25 22--25 23--25 24--25 5 --26 17--26 19--26
## [41] 20--26 24--26 25--26 13--27 16--27 20--27 24--27 26--27 17--28 20--28
## [51] 5 --29 16--29 17--29 20--29 24--29 25--29 26--29 16--30 17--30 19--30
## [61] 20--30 22--30 29--30 13--31 14--31 16--31 17--31 20--31 21--31 22--31
## [71] 23--31 24--31 25--31 26--31 28--31 30--31 15--32 19--32 21--32 22--32
## + ... omitted several edges
# Find the minimum number of cuts using min_cut(); value.only = FALSE also
# returns the partitions and the cut edges themselves
ud_cut <- min_cut(trip_g_ud, value.only = FALSE)
# Print the vertex with the minimum number of cuts (the smaller partition)
print(ud_cut$partition1)
## + 1/300 vertex, named, from f0f6a40:
## [1] 281
# Make an ego graph around the weakly connected station 281
g<- make_ego_graph(trip_g_ud, 1, nodes = "281")[[1]]
plot(g, edge.color = 'black', edge.arrow.size = .1)
# Print the value (number of edges whose removal disconnects the graph)
print(ud_cut$value)
## [1] 5
# Print cut object (the 5 edges all touch vertex 281)
print(ud_cut$cut)
## + 5/12972 edges from f0f6a40 (vertex names):
## [1] 71 --281 135--281 167--281 203--281 281--305
# Geographically far vs close station pairs for a source/target min-cut
far_stations <- c("231", "321")
close_stations <- c("231", "213")
# Compare the minimum s-t cut values of close and far vertex pairs
stMincuts(trip_g_simp, far_stations[1], far_stations[2])$value
## [1] 54
stMincuts(trip_g_simp, close_stations[1], close_stations[2])$value
## [1] 49
# Observed global clustering coefficient (transitivity) of the real graph
clust_coef <- transitivity(trip_g_simp, type = "global")
# Get randomization parameters using gorder() and edge_density()
nv <- gorder(trip_g_simp)
ed <- edge_density(trip_g_simp)
# Create an empty vector to hold output of 300 simulations (preallocated)
graph_vec <- rep(NA, 300)
# Null distribution: clustering of Erdos-Renyi random graphs matched on
# vertex count and edge density
for(i in 1:300){
graph_vec[i]<- transitivity(erdos.renyi.game(nv, ed, "gnp", directed = T), type = "global")
}
# Plot a histogram of the simulated values
hist(graph_vec, xlim = c(.35, .6), main = "Unweighted clustering randomization")
# Add a line with the true value; being outside the null distribution
# indicates more clustering than expected by chance
abline(v = clust_coef, col = "red")
# Find the mean local weighted clustering coefficient.
# NOTE(review): transitivity(type = "weighted") auto-detects an edge
# attribute named `weight`; trip_g_simp stores `weights`, so this call may
# actually be unweighted -- confirm, or pass
# weights = E(trip_g_simp)$weights explicitly.
m_clust <- mean(transitivity(trip_g_simp, type = "weighted"))
nv <- gorder(trip_g_simp)
ed <- edge_density(trip_g_simp)
graph_vec <- rep(NA, 100)
for(i in 1:100){
g_temp <- erdos.renyi.game(nv, ed, "gnp", directed = T)
# Sample existing weights and add them to the random graph (here the
# attribute IS named `weight`, which transitivity() will pick up)
E(g_temp)$weight <- sample(x = E(trip_g_simp)$weights, size = gsize(g_temp), replace = TRUE)
graph_vec[i]<- mean(transitivity(g_temp, type = "weighted"))
}
# Plot a histogram of the simulated values.
# BUG FIX: this is the weighted randomization -- the title previously read
# "Unweighted clustering randomization", copied from the prior block.
hist(graph_vec, xlim = c(.35, .7), main = "Weighted clustering randomization")
# Add a line with the true value
abline(v = m_clust ,col = "red")
Chapter 4 - Other Ways to Visualize Graph Data
Other packages for plotting graphs:
Interactive visualizations:
Alternative visualizations:
Example code includes:
# Fixed sample of vertex ids from the retweet graph rt_g, used to carve out
# a plottable subgraph below (too many numbers for one call, so it is built
# in two c() calls).
verts <- c(1185, 3246, 1684, 3634, 3870, 188, 2172, 3669, 2267, 1877, 3931, 1862, 2783, 2351, 423, 3692, 1010, 173, 1345, 3913, 3646, 2839, 2624, 4072, 2685, 2901, 2227, 2431, 1183, 602, 3937, 3688, 2823, 3250, 101, 1951, 3097, 884, 1299, 945, 583, 1691, 1687, 1504, 622, 566, 949, 1897, 1083, 3491, 187, 1799, 3249, 496, 2280, 840, 519, 3060, 4115, 1520, 2700, 385, 1558, 1113, 3303, 1818, 3283, 3291, 3218, 1781, 3055, 2547, 2874, 3, 1923, 890, 1536, 2477, 1422, 449, 984, 2697, 1686, 3181, 415, 1754, 3972, 3600, 3573, 706, 527, 2631, 1383, 2644, 1290, 756, 3147, 377, 4109, 2056, 2411, 1337, 1963, 3833, 1939, 4030, 4111, 2442, 1647, 590, 3749, 1208, 244, 3796, 2886, 570, 2199, 3818, 2342, 1618, 2591, 1279, 1230, 878, 1476, 3930, 616, 364, 567, 2753, 2470, 3554, 2683, 2938, 2077, 2629, 3273, 3131, 3900, 1749, 1240, 1629, 42, 731, 3350, 919, 950, 305, 976, 2906, 3363, 1974, 1539, 978, 441, 1546, 4110, 860, 1762, 864, 1989, 1401, 2572, 1482, 1406, 2110, 2926, 874, 1631, 1050, 2488, 726, 3408, 2946, 2636, 2437, 1468, 2089, 3447, 2292, 3308, 1231, 2788, 1043, 2339, 1893, 3935, 2220, 3589, 3544, 1077, 1263, 4114, 2434, 3679, 1831, 1596, 2585, 598, 2246, 936, 3770, 2355, 2017, 1576, 3445, 1425, 1128, 668, 674, 1884, 989, 845, 2634, 4068, 2736, 1374, 3922, 3202, 3583, 1102, 3746, 2838, 2674, 206, 3966, 1860, 2180, 2717, 3562, 2405, 1666, 2107, 228, 1014, 1543, 768, 3229, 594, 3117, 2121, 2568, 666, 2454, 1209, 2807, 1545, 3753, 3744, 2812, 995, 858, 2293, 1034, 2053, 3034, 650, 1562, 1821, 3351, 3572, 3402, 2600, 3663, 1991, 2222, 1296, 1338, 78, 1936, 3352, 25, 278, 632, 2962, 2826, 3734, 1792, 286, 2491, 2912, 4028, 1522, 863, 223, 1518, 249, 866, 210, 2567, 1140, 386, 276, 3368, 2885, 3122, 3754, 396, 379, 3051, 2996, 36, 2973, 4106, 2404, 1834, 3920, 32, 1724, 1876, 1484, 1769, 2715, 211, 1350, 3054, 3178, 904, 1346, 3256, 3243, 1124, 559, 2672, 394, 128, 3790, 133, 1283, 3468, 3934, 1085, 2794, 3157, 1190, 1864, 2638, 2426, 2435, 3696, 1567, 451, 1987, 850, 1836, 1397, 
3710, 1465, 865, 2350, 515, 3645, 1940, 614, 2341, 3711, 2516, 3914, 1216, 3140, 541, 725, 3369, 1157, 1364, 2943, 3947, 67, 1525, 1812, 1582, 1285, 4117, 1705, 1999, 3608, 2899, 782, 1155, 3632, 2187, 2844, 1393, 2873, 2008, 3412, 692, 1053, 355, 785, 3643, 1105, 2706, 2927, 393, 893, 1007, 4021, 439, 3687, 3667, 510, 3365, 2141, 1469, 1671, 2623, 307, 1259, 2526, 1176, 3083, 798, 1845, 1023, 712, 3520, 1191, 1771, 104, 2025, 2382, 2204, 3784, 3292, 2313, 1119, 1433, 593, 3182, 3516, 2079, 1215, 3673, 3831, 2257, 399, 1793, 366, 3690, 1041, 2147, 2690, 609, 3184, 2603, 2793, 540, 1315, 2471, 1922, 3792, 882, 214, 867, 3261, 3816, 2737, 3990, 457, 3566, 1595, 1697, 605, 2138, 990, 841, 2524, 1033, 2958, 343, 2998, 1559, 2756, 2414, 1620, 2285, 2, 791, 2566, 783, 2961, 1120, 2500, 3390, 421, 464, 2463, 4056, 3029, 3525, 256, 1668, 2544, 316, 3598, 917, 180, 2485, 2848, 1280, 1326, 1039, 290, 1321, 644)
verts <- c(verts, 1937, 1820, 3733, 1232, 1677, 298, 3102, 1427, 2653, 619, 1639, 2774, 226, 2934, 1084, 1312, 1123, 135, 1865, 2440, 3245, 92, 3551, 1088, 3370, 2467, 1604, 2928, 142, 2648, 1250, 2970, 1918, 983, 2866, 328, 2976, 3653, 2692, 4099, 291, 3819, 2864, 1375, 1169, 732, 2031, 3166, 1888, 2092, 2372, 1887, 1816, 58, 170, 3306, 3903, 715, 2312, 2323, 1404, 3824, 1942, 3142, 1964, 3214, 2084, 1502, 3366, 2513, 1464, 66, 2007, 1735, 3109, 2876, 3021, 1301, 3089, 535, 996, 3916, 3451, 2057, 1858, 215, 3417, 424, 312, 3103, 1791, 1189, 3149, 113, 835, 2415, 794, 3636, 612, 2816, 514, 2889, 1162, 1313, 2210, 339, 3850, 3481, 2047, 2739, 3124, 2643, 3428, 155, 3161, 3027, 2711, 1317, 148, 1273, 956, 2969, 1265, 1063, 3899, 3945, 1597, 2543, 363, 767, 3322, 2618, 2850, 1454, 2066, 2778, 3534, 1339, 314, 2174, 2589, 297, 3932, 2132, 2612, 3180, 1649, 1966, 2552, 3581, 3148, 196, 1741, 1213, 2924, 3936, 406, 3631, 813, 259, 3230, 543, 2233, 599, 70, 1797, 3607, 975, 1448, 2022, 2777, 696, 1581, 1542, 2523, 2457, 2857, 3046, 3272, 1891, 3681, 586, 1644, 871, 137, 2176, 1849, 480, 972, 1996, 565, 330, 1466, 1217, 2888, 889, 80, 3487, 1143, 2157, 3594, 3747, 634, 1463, 2150, 1775, 2247, 2484, 1658, 1309, 24, 13, 3383, 367, 1423, 2439, 2522, 3637, 2064, 3639, 4046, 2078, 3676, 3506, 1413, 2964, 2192, 3130, 4078, 1069, 2720, 3344, 1090, 5, 3848, 501, 167, 3915, 3787, 4049, 3986, 233, 2343, 3196, 3918, 4063, 537, 242, 3809, 1648, 1662, 2986, 124, 685, 1726, 4087, 1932, 3999, 1910, 484, 489, 1382, 2289, 2189, 3067, 2722, 2262, 2702, 429, 839, 1109, 1361, 2123, 4058, 3959, 2735, 52, 2183, 2707, 1538, 678, 63, 943, 3047, 3108, 1806, 730, 1628, 2664, 1355, 345, 932, 1201, 861, 3861, 1214, 403, 156, 3429, 3210, 3355, 1583, 2479, 3508, 164, 2299, 3320, 2923, 2562, 460, 4013, 417, 1947, 1853, 2272, 1027, 1997, 3266, 2449, 250, 1486, 177, 1118, 3644, 14, 2538, 3836, 2368, 3349, 1879, 2310, 3413, 4032, 319, 3155, 2413, 3842, 3724, 1802, 3319, 2940, 31, 773, 426, 1067, 2374, 
3240, 2335, 4010, 3398, 3096, 392, 245, 2898, 4026, 138, 2109, 1526, 2011, 881, 512, 372, 1650, 3373, 3659, 552, 2474, 1712, 3786, 2185, 43, 3406, 2890, 3504, 348, 2982, 2186, 481, 4018, 3048, 1360, 962, 838, 720, 1826, 4011, 2161, 1763, 2617, 2447, 65, 1227, 3938, 2569, 3662, 1746, 2742, 4020, 2148, 1643, 2450, 4093, 3905, 230, 3401, 168, 2779, 1847, 1006, 3074, 1894, 1702, 1229, 3704, 2586, 3595, 1163, 3661, 2230, 3236, 1111, 1770, 438, 2504, 2828, 651, 2456, 1900, 3050, 506, 1674, 3477, 2766, 76, 3606, 3630, 1237, 3617, 295, 3512, 1286, 3623, 3495, 964, 3407, 494, 3629, 140, 1178, 3045, 2041, 194, 3852, 3800, 1605, 1420, 1968, 442, 3570, 1796, 1729, 369, 2401, 1507, 2462, 145, 2580, 848, 4043, 3443, 2979, 22, 3727, 1316, 1437, 3450, 3590, 3465, 3188, 2373, 432, 3425, 3449, 1356, 273, 700, 1789, 1251, 1767, 3998, 2005, 1222, 2214, 340)
# Create subgraph of rt_g restricted to the sampled vertex ids
rt_samp <- induced_subgraph(rt_g, verts)
# Convert from igraph using asNetwork() (network objects are what
# GGally::ggnet2 expects)
net <- intergraph::asNetwork(rt_samp)
# Plot using igraph
plot(rt_samp, vertex.label = NA, edge.arrow.size = 0.2, edge.size = 0.5,
vertex.color = "black", vertex.size = 1
)
# Plot the same subgraph using ggnet2 for comparison
GGally::ggnet2(net, node.size = 1, node.color = "black", edge.size = .4)
# Raw plot of rt_samp using ggnetwork() (fortifies the graph into a data
# frame with x/y/xend/yend columns for ggplot2)
library(ggnetwork)
library(GGally)
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
ggplot(ggnetwork(rt_samp, arrow.gap = .01) , aes(x = x, y = y, xend = xend, yend = yend)) +
geom_edges(arrow = arrow(length = unit(6, "pt"), type = "closed"), color = "black") +
geom_nodes(size = 4)
## Loading required package: sna
## Loading required package: statnet.common
##
## Attaching package: 'statnet.common'
## The following object is masked from 'package:base':
##
## order
## Loading required package: network
## network: Classes for Relational Data
## Version 1.13.0.1 created on 2015-08-31.
## copyright (c) 2005, Carter T. Butts, University of California-Irvine
## Mark S. Handcock, University of California -- Los Angeles
## David R. Hunter, Penn State University
## Martina Morris, University of Washington
## Skye Bender-deMoll, University of Washington
## For citation information, type citation("network").
## Type help("network-package") to get started.
##
## Attaching package: 'network'
## The following objects are masked from 'package:igraph':
##
## %c%, %s%, add.edges, add.vertices, delete.edges,
## delete.vertices, get.edge.attribute, get.edges,
## get.vertex.attribute, is.bipartite, is.directed,
## list.edge.attributes, list.vertex.attributes,
## set.edge.attribute, set.vertex.attribute
## sna: Tools for Social Network Analysis
## Version 2.4 created on 2016-07-23.
## copyright (c) 2005, Carter T. Butts, University of California-Irvine
## For citation information, type citation("sna").
## Type help(package="sna") to get started.
##
## Attaching package: 'sna'
## The following objects are masked from 'package:igraph':
##
## betweenness, bonpow, closeness, components, degree,
## dyad.census, evcent, hierarchy, is.connected, neighborhood,
## triad.census
# Prettier plot of rt_samp using ggnetwork(): curved edges, blank theme
ggplot(ggnetwork(rt_samp, arrow.gap = .01),aes(x = x, y = y, xend = xend, yend = yend)) +
geom_edges(arrow = arrow(length = unit(6, "pt"), type = "closed"), color = "black", curvature = .2) +
geom_nodes(size = 4) + theme_blank()
# NEED TO FIX!
# Cluster sizes, largest first, keyed by the `clust` vertex attribute
rt_keys <- sort(table(vertex_attr(rt_g)$clust), decreasing=TRUE)
# Earlier approach (delete all but the top 10 clusters), kept for reference:
# rt_drops <- names(rt_keys)[11:length(rt_keys)]
# vt_drops <- which(vertex_attr(rt_g)$clust %in% rt_drops)
# rt_use <- delete_vertices(rt_g, vt_drops)
# Equivalent: keep only vertices belonging to the 10 largest clusters
rt_use <- induced_subgraph(rt_g, which(V(rt_g)$clust %in% names(rt_keys[1:10])))
# Convert to a network object
net <- intergraph::asNetwork(rt_use)
# Size nodes by centrality, color by community, with a Spectral palette
ggnet2(net, node.size = "cent", node.color = "clust", edge.size = .1,
color.legend = "Community Membership", color.palette = "Spectral"
)
# Now remove the centrality legend by setting size to false in the guides() function
ggnet2(net, node.size = "cent", node.color = "clust", edge.size = .1,
color.legend = "Community Membership", color.palette = "Spectral"
) +
guides( size = FALSE)
# Add edge colors: edges within a community take the community color,
# cross-community edges are gray88
ggnet2(net, node.size = "cent", node.color = "clust", edge.size = .1,
color.legend = "Community Membership", color.palette = "Spectral",
edge.color = c("color", "gray88")) +
guides( size = FALSE)
# NEED TO CREATE rt_g_smaller!
# Basic plot where we set parameters for the plots using geom_edges() and geom_nodes()
# ggplot(ggnetwork(rt_g_smaller, arrow.gap = .01), aes(x = x, y = y, xend = xend, yend = yend)) +
# geom_edges(arrow = arrow(length = unit(6, "pt"), type = "closed"), curvature = .2, color = "black") +
# geom_nodes(size = 4, aes(color = comm)) +
# theme_blank()
# Added guide legend, changed line colors, added size
# ggplot(ggnetwork(rt_g_smaller, arrow.gap = .01), aes(x = x, y = y, xend = xend, yend = yend)) +
# geom_edges(arrow = arrow(length = unit(6, "pt"), type = "closed"), curvature = .2, lwd = .3, aes(color=comm)) +
# geom_nodes(aes(color = comm, size = cent)) +
# theme_blank() +
# guides(color = guide_legend(title = "Community"), size = guide_legend(title = "Centrality"))
# NEED TO FIX!
# Add betweenness centrality using betweenness() (namespaced because the
# sna package, attached above, masks igraph::betweenness)
V(trip_g_simp)$cent <- igraph::betweenness(trip_g_simp)
# Create a ggplot object with ggnetwork to render using ggiraph
g <- ggplot(ggnetwork(trip_g_simp, arrow.gap = .01), aes(x = x, y = y, xend = xend, yend = yend)) +
geom_edges(color = "black") +
geom_nodes(aes(size = cent)) +
theme_blank()
plot(g)
# Create ggiraph object and assign the tooltip to be interactive
my_gg <- g + ggiraph::geom_point_interactive(aes(tooltip = round(cent, 2),
data_id = round(cent, 2)
), size = 2
)
# Define some hover css so the cursor turns red
hover_css = "cursor:pointer;fill:red;stroke:red;r:3pt"
# Rendering call disabled in this knit (interactive htmlwidget):
# ggiraph::ggiraph(code = print(my_gg), hover_css = hover_css, tooltip_offx = 10, tooltip_offy = -10)
# Add community membership as a vertex attribute using the cluster_walktrap algorithm
V(rt_g)$comm <- membership(cluster_walktrap(rt_g))
# Create an induced_subgraph restricted to communities 10-13
rt_sub_g <- induced_subgraph(rt_g, which(V(rt_g)$comm %in% 10:13))
# Plot to see what it looks like without an interactive plot using ggnetwork
ggplot(ggnetwork(rt_sub_g, arrow.gap = .01), aes(x = x, y = y, xend = xend, yend = yend)) +
geom_edges(color = "black") +
geom_nodes(aes(color = as.factor(comm))) +
theme_blank()
# Convert to a networkD3 object (disabled: htmlwidget output)
# nd3 <- igraph_to_networkD3(rt_sub_g)
# Assign grouping factor as community membership
# nd3$nodes$group = V(rt_sub_g)$comm
# Render your D3.js graph
# forceNetwork(Links = nd3$links, Nodes = nd3$nodes, Source = 'source',
# Target = 'target', NodeID = 'name', Group = 'group', legend = T, fontSize = 20
# )
# Hive-plot example (disabled; depends on objects not built in this knit)
# Convert trip_df to hive object using edge2HPD()
# bike_hive <- edge2HPD(edge_df = as.data.frame(trip_df))
# Assign to trip_df edgecolor using our custom function
# trip_df$edgecolor <- dist_gradient(trip_df$geodist)
# Calculate centrality with betweenness()
# bike_cent <- betweenness(trip_g)
# Add axis and radius based on longitude and radius
# bike_hive$nodes$radius<- ifelse(bike_cent > 0, bike_cent, runif(1000, 0, 3))
# Set axis as integers and axis colors to black
# bike_hive$nodes$axis <- as.integer(dist_stations$axis)
# bike_hive$axis.cols <- rep("black", 3)
# Set the edge colors to a heatmap based on trip_df$edgecolor
# bike_hive$edges$color <- trip_df$edgecolor
# plotHive(bike_hive, method = "norm", bkgnd = "white")
# Add community membership as a vertex attribute
V(rt_g)$comm <- membership(cluster_walktrap(rt_g))
# Create a subgraph (communities 10-15 this time)
rt_sub_g <- induced_subgraph(rt_g, which(V(rt_g)$comm %in% 10:15))
# Plot to see what it looks like without an interactive plot
ggplot(ggnetwork(rt_sub_g, arrow.gap = .01), aes(x = x, y = y, xend = xend, yend = yend)) +
geom_edges(color = "black") +
geom_nodes(aes(color = as.factor(comm)))+ theme_blank() +
theme(legend.position = "none")
# Make a Biofabric plot htmlwidget (disabled: htmlwidget output)
# rt_bf <- bioFabric(rt_sub_g)
# bioFabric_htmlwidget(rt_bf)
# Create a dataframe of start and end latitude and longitude and add weights
# ll_to_plot <- bike_dat %>% group_by(from_station_id, to_station_id, from_latitude,
# from_longitude, to_latitude, to_longitude, usertype
# ) %>%
# summarise(weight = n())
# Create a base map with station points with ggmap()
# ggmap(chicago) +
# geom_segment(data = ll_to_plot, aes(x = from_longitude, y = from_latitude,
# xend = to_longitude, yend = to_latitude,
# colour = usertype, size = weight
# ), alpha = .5
# )
Chapter 1 - What is Bayesian Analysis?
Introduction:
Bayesian data analysis - named for Thomas Bayes from the early-mid 1700s:
Samples and posterior samples:
Chapter wrap-up:
Example code includes:
# Visualize sequential Bayesian updating for a Bernoulli proportion.
#
# data       : vector coercible to logical; successes (TRUE/1) and failures.
# prior_prop : shape1/shape2 of the Beta prior (default Beta(1, 1), uniform).
# n_draws    : number of posterior samples to return invisibly.
#
# Side effect: prints a ridgeline plot of the prior and up to 19 intermediate
# posteriors. Returns (invisibly) n_draws samples from the final
# Beta(prior + successes, prior + failures) posterior.
prop_model <- function(data = c(), prior_prop = c(1, 1), n_draws = 10000) {
data <- as.logical(data)
# Grid over [0, 1]; duplicated endpoints close the ridgeline polygons
proportion_success <- c(0, seq(0, 1, length.out = 100), 1)
# At most 20 curves: the prior (i = 0) plus up to 19 data points
data_indices <- round(seq(0, length(data), length.out = min(length(data) + 1, 20)))
post_curves <- map_dfr(data_indices, function(i) {
value <- ifelse(i == 0, "Prior", ifelse(data[i], "Success", "Failure"))
label <- paste0("n=", i)
# Beta posterior after the first i observations (conjugate update)
probability <- dbeta(proportion_success, prior_prop[1] + sum(data[seq_len(i)]),
prior_prop[2] + sum(!data[seq_len(i)])
)
# Normalize each curve to max 1 so the ridges share a common height
probability <- probability / max(probability)
# FIX: tibble() replaces the deprecated data_frame() alias
tibble(value, label, proportion_success, probability)
}
)
post_curves$label <- fct_rev(factor(post_curves$label, levels = paste0("n=", data_indices )))
post_curves$value <- factor(post_curves$value, levels = c("Prior", "Success", "Failure"))
p <- ggplot(post_curves, aes(x = proportion_success, y = label, height = probability, fill = value)) +
ggridges::geom_density_ridges(stat="identity", color = "white",
alpha = 0.8, panel_scaling = TRUE, size = 1
) +
scale_y_discrete("", expand = c(0.01, 0)) +
scale_x_continuous("Underlying proportion of success") +
scale_fill_manual(values = hcl(120 * 2:0 + 15, 100, 65), name = "",
drop = FALSE, labels = c("Prior ", "Success ", "Failure ")
) +
#ggtitle(paste0("Binomial model - Data: ", sum(data), " successes, " , sum(!data), " failures")) +
theme_light(base_size = 18) +
theme(legend.position = "top")
print(p)
# Invisible so interactive calls just show the plot, but the posterior
# sample can still be captured with <-
invisible(rbeta(n_draws, prior_prop[1] + sum(data), prior_prop[2] + sum(!data)))
}
# Define data (1 = success, 0 = failure) and run prop_model
data = c(1, 0, 0, 1)
prop_model(data)
# Define data and run prop_model (repeated call from the course exercise)
data = c(1, 0, 0, 1)
prop_model(data)
# Larger data set: 2 successes out of 13 trials
data = c(1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0)
# Capture the 10,000 invisible posterior draws
posterior <- prop_model(data)
head(posterior)
## [1] 0.1036182 0.1555942 0.1115889 0.1805051 0.2815857 0.3078648
hist(posterior, breaks = 30, xlim = c(0, 1), col = "palegreen4")
# Summarize the posterior sample: point estimate, 90% credible interval,
# and a tail probability
median(posterior)
## [1] 0.1882898
quantile(posterior, c(0.05, 0.95))
## 5% 95%
## 0.06031634 0.38918378
sum(posterior > 0.07) / length(posterior)
## [1] 0.9263
Chapter 2 - How Does Bayesian Inference Work?
Parts needed for Bayesian inference:
Using a generative model:
Representing uncertainty with priors:
Bayesian models and conditioning:
Chapter wrap-up:
Example code includes:
# Generative zombie drug model: each of n_zombies is cured independently
# with probability prop_success (Bernoulli trials).
# Parameters
prop_success <- 0.42
n_zombies <- 100
# Simulating data.
# IDIOM FIX: one vectorized draw replaces the original loop that grew
# `data` element-by-element with runif(1) comparisons. It consumes the
# same RNG stream, so results are identical for a given seed.
data <- as.numeric(runif(n_zombies, min = 0, max = 1) < prop_success)
data
## [1] 1 1 1 1 0 1 0 1 1 1 0 0 0 1 0 0 1 1 0 1 1 0 1 1 1 1 0 0 1 0 1 1 1 1 1
## [36] 0 1 0 0 0 1 0 0 0 1 1 1 1 1 0 1 0 1 1 1 0 0 1 0 1 0 1 1 1 0 1 1 0 0 0
## [71] 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0
# Total number of cured zombies in this simulated sample
data_counts <- sum(as.numeric(data))
data_counts
## [1] 46
# Try out rbinom: one draw of "cured out of 100" at 42% success rate
# (equivalent to the sum of 100 Bernoulli trials above)
rbinom(n = 1, size = 100, prob = 0.42)
## [1] 31
# Try out rbinom: 200 replicate experiments to see sampling variability
rbinom(n = 200, size = 100, prob = 0.42)
## [1] 44 51 40 48 44 37 41 41 50 49 44 42 43 41 41 33 39 40 41 39 46 43 45
## [24] 41 42 38 41 45 45 39 41 48 45 45 43 41 36 39 45 44 37 37 39 51 43 31
## [47] 42 39 42 42 38 46 36 41 48 42 41 37 41 38 36 43 53 37 49 35 43 51 34
## [70] 44 40 44 39 47 47 40 43 38 41 38 40 36 35 37 37 40 41 47 39 39 45 45
## [93] 47 44 36 49 33 40 44 39 40 36 42 38 47 39 48 48 50 42 51 39 44 42 45
## [116] 38 44 40 40 41 40 46 43 40 46 49 40 48 43 43 39 39 41 44 45 43 43 41
## [139] 48 38 47 42 45 38 44 44 38 40 36 38 46 41 42 29 40 42 40 45 33 46 54
## [162] 47 36 49 41 41 39 40 44 43 52 41 38 38 45 49 49 39 47 40 42 37 43 46
## [185] 42 41 47 48 38 50 42 40 38 36 49 39 46 37 42 36
# Fill in the parameters: fixed click-through rate of 10%
n_samples <- 100000
n_ads_shown <- 100
proportion_clicks <- 0.1
n_visitors <- rbinom(n_samples, size = n_ads_shown, prob = proportion_clicks)
# Visualize the results
hist(n_visitors)
# Update proportion_clicks: replace the fixed rate with a Uniform(0, 0.2)
# prior, propagating parameter uncertainty into the simulated counts
n_samples <- 100000
n_ads_shown <- 100
proportion_clicks <- runif(n = n_samples, min = 0.0, max = 0.2)
n_visitors <- rbinom(n = n_samples, size = n_ads_shown, prob = proportion_clicks)
# Visualize the results
hist(n_visitors)
hist(proportion_clicks)
# Create prior: the joint prior sample over (parameter, simulated data)
prior <- data.frame(proportion_clicks, n_visitors)
head(prior)
## proportion_clicks n_visitors
## 1 0.102175374 6
## 2 0.066737942 6
## 3 0.007708941 0
## 4 0.123223223 13
## 5 0.070207388 6
## 6 0.112998033 7
# Create posterior by conditioning: keep only draws that reproduce the
# observed data (13 visitors) -- rejection sampling
posterior <- prior[prior$n_visitors==13, ]
hist(posterior$proportion_clicks)
# Yesterday's posterior becomes today's prior
prior <- posterior
head(prior)
## proportion_clicks n_visitors
## 4 0.1232232 13
## 21 0.1409067 13
## 44 0.0885686 13
## 49 0.1316969 13
## 61 0.1619553 13
## 76 0.1349985 13
# Posterior predictive: simulate fresh data from the retained parameters
prior$n_visitors <- rbinom(nrow(prior), size=100, prob=prior$proportion_clicks)
hist(prior$n_visitors)
# Probability of getting at least 5 visitors next time
mean(prior$n_visitors >= 5)
## [1] 0.9867033
Chapter 3 - Why Use Bayesian Data Analysis?
Four good things with Bayes:
Contrasts and comparisons:
Decision analysis:
Change anything and everything:
Bayes is optimal, kind of . . .
Example code includes:
# Draw from the beta distribution; Beta(1, 1) is the uniform distribution
beta_sample <- rbeta(n = 1000000, shape1 = 1, shape2 = 1)
# Explore the results
hist(beta_sample)
# Draw from the beta distribution; Beta(100, 100) concentrates around 0.5
beta_sample <- rbeta(n = 10000, shape1 = 100, shape2 = 100)
# Explore the results
hist(beta_sample)
# Draw from the beta distribution; shape1 > shape2 skews the mass upward
beta_sample <- rbeta(n = 10000, shape1 = 100, shape2 = 20)
# Explore the results
hist(beta_sample)
n_draws <- 100000
n_ads_shown <- 100
# Update proportion_clicks: informative Beta(5, 95) prior (mean 0.05)
proportion_clicks <- rbeta(n_draws, shape1 = 5, shape2 = 95)
n_visitors <- rbinom(n_draws, size = n_ads_shown, prob = proportion_clicks)
prior <- data.frame(proportion_clicks, n_visitors)
# Condition on the observed data (13 visitors) by rejection
posterior <- prior[prior$n_visitors == 13, ]
# Plots the prior and the posterior in the same plot (stacked panels)
par(mfcol = c(2, 1))
hist(prior$proportion_clicks,
xlim = c(0, 0.25))
hist(posterior$proportion_clicks,
xlim = c(0, 0.25))
# Reset mfcol below
# Define parameters: Uniform(0, 0.2) prior over the click-through rate
n_draws <- 100000
n_ads_shown <- 100
proportion_clicks <- runif(n_draws, min = 0.0, max = 0.2)
n_visitors <- rbinom(n = n_draws, size = n_ads_shown, prob = proportion_clicks)
prior <- data.frame(proportion_clicks, n_visitors)
# Create posteriors: condition on 13 visitors (video ad) and 6 (text ad)
posterior_video <- prior[prior$n_visitors == 13, ]
posterior_text <- prior[prior$n_visitors == 6, ]
# Visualize posteriors
hist(posterior_video$proportion_clicks, xlim = c(0, 0.25))
hist(posterior_text$proportion_clicks, xlim = c(0, 0.25))
# Pair up 4000 draws from each posterior for comparison.
# BUG FIX: the original read posterior_text$proportion_click (missing the
# final "s"); it only worked because `$` on a data.frame does partial
# matching -- a silent footgun now spelled out in full.
posterior <- data.frame(video_prop = posterior_video$proportion_clicks[1:4000],
text_prop = posterior_text$proportion_clicks[1:4000]
)
# Create prop_diff: posterior of the difference in click-through rates
posterior$prop_diff <- posterior$video_prop - posterior$text_prop
# Plot your new column
hist(posterior$prop_diff)
# Explore prop_diff
median(posterior$prop_diff)
## [1] 0.06583102
# Posterior probability that the video ad has the higher rate
mean(posterior$prop_diff > 0)
## [1] 0.947
# Decision analysis: convert click-through rates into expected profit.
# Revenue per visitor and cost per ad shown, in dollars.
visitor_spend <- 2.53
video_cost <- 0.25
text_cost <- 0.05
# Posterior profit per ad = rate * revenue - cost, for each ad type
posterior$video_profit <- posterior$video_prop * visitor_spend - video_cost
posterior$text_profit <- posterior$text_prop * visitor_spend - text_cost
head(posterior)
## video_prop text_prop prop_diff video_profit text_profit
## 1 0.11438338 0.06966232 0.0447210643 0.03938996 0.12624567
## 2 0.08828099 0.08925227 -0.0009712789 -0.02664909 0.17580825
## 3 0.13337523 0.03383959 0.0995356356 0.08743932 0.03561416
## 4 0.11821430 0.08126194 0.0369523543 0.04908217 0.15559272
## 5 0.10437450 0.07691290 0.0274615984 0.01406748 0.14458963
## 6 0.13520477 0.07912402 0.0560807510 0.09206808 0.15018378
hist(posterior$video_profit)
hist(posterior$text_profit)
# Posterior of the profit difference (video minus text)
posterior$profit_diff <- posterior$video_profit - posterior$text_profit
head(posterior)
## video_prop text_prop prop_diff video_profit text_profit profit_diff
## 1 0.11438338 0.06966232 0.0447210643 0.03938996 0.12624567 -0.08685571
## 2 0.08828099 0.08925227 -0.0009712789 -0.02664909 0.17580825 -0.20245734
## 3 0.13337523 0.03383959 0.0995356356 0.08743932 0.03561416 0.05182516
## 4 0.11821430 0.08126194 0.0369523543 0.04908217 0.15559272 -0.10651054
## 5 0.10437450 0.07691290 0.0274615984 0.01406748 0.14458963 -0.13052216
## 6 0.13520477 0.07912402 0.0560807510 0.09206808 0.15018378 -0.05811570
hist(posterior$profit_diff)
median(posterior$profit_diff)
## [1] -0.03344751
# Probability that the text ad is more profitable despite fewer clicks
mean(posterior$profit_diff < 0)
## [1] 0.6345
# Poisson examples: counts of events at a fixed rate lambda
x <- rpois(n = 10000, lambda = 3)
hist(x)
x <- rpois(n = 10000, lambda = 11.5)
hist(x)
x <- rpois(n = 10000, lambda = 11.5)
# Probability of 15 or more events under lambda = 11.5
mean(x >= 15)
## [1] 0.182
# Poisson model for daily site visits with a Uniform(0, 80) prior on the rate
n_draws <- 100000
n_ads_shown <- 100
mean_clicks <- runif(n_draws, min = 0, max = 80)
n_visitors <- rpois(n_draws, lambda=mean_clicks)
prior <- data.frame(mean_clicks, n_visitors)
# Condition on the observed count of 19 visitors
posterior <- prior[prior$n_visitors == 19, ]
hist(prior$mean_clicks)
hist(posterior$mean_clicks)
# Reset to default (single plot panel, undoing the earlier mfcol = c(2, 1))
par(mfcol = c(1, 1))
Chapter 4 - Bayesian Inference with Bayes’ Theorem
Probability rules:
Calculating likelihoods:
Bayesian calculation:
Bayes theorem:
Example code includes:
# Basic probability rules with playing cards: P(ace) and, via the product
# rule for dependent events, P(four aces in a row without replacement)
prob_to_draw_ace <- 4 / 52
prob_to_draw_four_aces <- (4 / 52) * (3 / 51) * (2 / 50) * (1 / 49)
# Estimate P(13 visitors | rate = 0.1) by Monte Carlo simulation
n_ads_shown <- 100
proportion_clicks <- 0.1
n_visitors <- rbinom(n = 99999,
size = n_ads_shown, prob = proportion_clicks)
prob_13_visitors <- sum(n_visitors == 13) / length(n_visitors)
prob_13_visitors
## [1] 0.07536075
# The same probability computed exactly with the binomial density
prob_13_visitors <- dbinom(x=13, size=n_ads_shown, prob=proportion_clicks)
prob_13_visitors
## [1] 0.07430209
# dbinom is vectorized: the whole probability distribution over all
# possible outcomes 0..100 in one call
n_ads_shown <- 100
proportion_clicks <- 0.1
n_visitors <- 0:n_ads_shown
prob <- dbinom(n_visitors,
size = n_ads_shown, prob = proportion_clicks)
prob
## [1] 2.656140e-05 2.951267e-04 1.623197e-03 5.891602e-03 1.587460e-02
## [6] 3.386580e-02 5.957873e-02 8.889525e-02 1.148230e-01 1.304163e-01
## [11] 1.318653e-01 1.198776e-01 9.878801e-02 7.430209e-02 5.130383e-02
## [16] 3.268244e-02 1.929172e-02 1.059153e-02 5.426525e-03 2.602193e-03
## [21] 1.170987e-03 4.956559e-04 1.977617e-04 7.451890e-05 2.656461e-05
## [26] 8.972934e-06 2.875940e-06 8.758007e-07 2.537042e-07 6.998736e-08
## [31] 1.840408e-08 4.617512e-09 1.106279e-09 2.532895e-10 5.545880e-11
## [36] 1.161994e-11 2.331161e-12 4.480309e-13 8.253201e-14 1.457830e-14
## [41] 2.470212e-15 4.016606e-16 6.269305e-17 9.395858e-18 1.352434e-18
## [46] 1.870032e-19 2.484342e-20 3.171501e-21 3.890962e-22 4.587982e-23
## [51] 5.199713e-24 5.664175e-25 5.930440e-26 5.967739e-27 5.771270e-28
## [56] 5.363200e-29 4.788572e-30 4.107157e-31 3.383290e-32 2.676049e-33
## [61] 2.031815e-34 1.480375e-35 1.034671e-36 6.934301e-38 4.454325e-39
## [66] 2.741123e-40 1.615140e-41 9.106926e-43 4.910597e-44 2.530420e-45
## [71] 1.245128e-46 5.845669e-48 2.616117e-49 1.114936e-50 4.520010e-52
## [76] 1.741041e-53 6.363454e-55 2.203794e-56 7.220406e-58 2.234162e-59
## [81] 6.516307e-61 1.787738e-62 4.602579e-64 1.109055e-65 2.493907e-67
## [86] 5.216014e-69 1.010856e-70 1.807405e-72 2.966699e-74 4.444493e-76
## [91] 6.035732e-78 7.369636e-80 8.010474e-82 7.656367e-84 6.335055e-86
## [96] 4.445653e-88 2.572716e-90 1.178793e-92 4.009500e-95 9.000000e-98
## [101] 1.000000e-100
# Spike plot of the distribution computed above.
plot(x=n_visitors, y=prob, type="h")
# Flip the question: fix the data (13 visitors) and compute the likelihood
# as a function of the unknown click proportion.
n_ads_shown <- 100
proportion_clicks <- seq(0, 1, by = 0.01)
n_visitors <- 13
prob <- dbinom(n_visitors,
size = n_ads_shown, prob = proportion_clicks)
prob
## [1] 0.000000e+00 2.965956e-11 1.004526e-07 8.009768e-06 1.368611e-04
## [6] 1.001075e-03 4.265719e-03 1.247940e-02 2.764481e-02 4.939199e-02
## [11] 7.430209e-02 9.703719e-02 1.125256e-01 1.178532e-01 1.129620e-01
## [16] 1.001234e-01 8.274855e-02 6.419966e-02 4.701652e-02 3.265098e-02
## [21] 2.158348e-02 1.362418e-02 8.234325e-03 4.775927e-03 2.663369e-03
## [26] 1.430384e-03 7.408254e-04 3.704422e-04 1.790129e-04 8.366678e-05
## [31] 3.784500e-05 1.657584e-05 7.032793e-06 2.891291e-06 1.151996e-06
## [36] 4.448866e-07 1.665302e-07 6.041614e-08 2.124059e-08 7.234996e-09
## [41] 2.386939e-09 7.624614e-10 2.357105e-10 7.048636e-11 2.037726e-11
## [46] 5.691404e-12 1.534658e-12 3.991862e-13 1.000759e-13 2.415778e-14
## [51] 5.609229e-15 1.251336e-15 2.678760e-16 5.495443e-17 1.078830e-17
## [56] 2.023515e-18 3.620178e-19 6.166397e-20 9.980560e-21 1.531703e-21
## [61] 2.223762e-22 3.046572e-23 3.927965e-24 4.752038e-25 5.377247e-26
## [66] 5.671478e-27 5.554432e-28 5.030231e-29 4.193404e-30 3.201904e-31
## [71] 2.227032e-32 1.402449e-33 7.942805e-35 4.015572e-36 1.797200e-37
## [76] 7.054722e-39 2.403574e-40 7.024314e-42 1.737424e-43 3.582066e-45
## [81] 6.048981e-47 8.199196e-49 8.713462e-51 7.062754e-53 4.226413e-55
## [86] 1.795925e-57 5.170371e-60 9.521923e-63 1.044590e-65 6.239308e-69
## [91] 1.807405e-72 2.180415e-76 8.911963e-81 9.240821e-86 1.591196e-91
## [96] 2.358848e-98 1.001493e-106 1.546979e-117 8.461578e-133 6.239651e-159
## [101] 0.000000e+00
# Spike plot of the likelihood curve computed above.
plot(x=proportion_clicks, y=prob, type="h")
# Grid-approximation Bayes: enumerate every (proportion, count) pair,
# attach a Uniform(0, 0.2) prior, multiply by the binomial likelihood,
# normalise, then condition on the observed count of 6.
n_ads_shown <- 100
proportion_clicks <- seq(0, 1, by = 0.01)
n_visitors <- seq(0, 100, by = 1)
pars <- expand.grid(proportion_clicks = proportion_clicks,
n_visitors = n_visitors)
pars$prior <- dunif(pars$proportion_clicks, min = 0, max = 0.2)
pars$likelihood <- dbinom(pars$n_visitors,
size = n_ads_shown, prob = pars$proportion_clicks)
pars$probability <- pars$likelihood * pars$prior
pars$probability <- pars$probability / sum(pars$probability)
# Condition on n_visitors == 6 and re-normalise to get the posterior.
pars_conditioned <- pars[pars$n_visitors==6, ]
pars_conditioned$probability <- pars_conditioned$probability / sum(pars_conditioned$probability)
plot(x=pars_conditioned$proportion_clicks, y=pars_conditioned$probability, type="h")
# Simplify slightly for a known result of 6
# (same posterior, but the grid only includes the observed count).
n_ads_shown <- 100
proportion_clicks <- seq(0, 1, by = 0.01)
n_visitors <- 6
pars <- expand.grid(proportion_clicks = proportion_clicks,
n_visitors = n_visitors)
pars$prior <- dunif(pars$proportion_clicks, min = 0, max = 0.2)
pars$likelihood <- dbinom(pars$n_visitors,
size = n_ads_shown, prob = pars$proportion_clicks)
pars$probability <- pars$likelihood * pars$prior
pars$probability <- pars$probability / sum(pars$probability)
plot(pars$proportion_clicks, pars$probability, type = "h")
Chapter 5 - More Parameters, Data, and Bayes
Temperature in a normal lake:
Bayesian model of water temperature:
Beach party implications of water temperatures:
Practical tool (BEST):
Wrap up and next steps:
Example code includes:
# Normal distribution demo: simulate draws, then plot the exact density.
mu <- 3500
sigma <- 600
weight_distr <- rnorm(n = 100000, mean = mu, sd = sigma)
hist(weight_distr, xlim = c(0, 6000), col = "lightgreen")
mu <- 3500
sigma <- 600
weight <- seq(0, 6000, by=100)
likelihood <- dnorm(weight, mean=mu, sd=sigma)
plot(x=weight, y=likelihood, type="h")
# The IQ of a bunch of zombies
iq <- c(55, 44, 34, 18, 51, 40, 40, 49, 48, 46)
# Defining the parameter grid
pars <- expand.grid(mu = seq(0, 150, length.out = 100),
sigma = seq(0.1, 50, length.out = 100))
# Defining and calculating the prior density for each parameter combination
pars$mu_prior <- dnorm(pars$mu, mean = 100, sd = 100)
pars$sigma_prior <- dunif(pars$sigma, min = 0.1, max = 50)
pars$prior <- pars$mu_prior * pars$sigma_prior
# Calculating the likelihood for each parameter combination
# (product of per-observation densities; seq_len(nrow(pars)) would be the
# safer idiom, though nrow(pars) is always 10000 here)
for(i in 1:nrow(pars)) {
likelihoods <- dnorm(iq, pars$mu[i], pars$sigma[i])
pars$likelihood[i] <- prod(likelihoods)
}
# Calculating the probability of each parameter combination
pars$probability <- pars$likelihood * pars$prior / sum(pars$likelihood * pars$prior)
lattice::levelplot(probability ~ mu * sigma, data = pars)
head(pars)
## mu sigma mu_prior sigma_prior prior likelihood
## 1 0.000000 0.1 0.002419707 0.02004008 4.849113e-05 0
## 2 1.515152 0.1 0.002456367 0.02004008 4.922578e-05 0
## 3 3.030303 0.1 0.002493009 0.02004008 4.996010e-05 0
## 4 4.545455 0.1 0.002529617 0.02004008 5.069373e-05 0
## 5 6.060606 0.1 0.002566174 0.02004008 5.142633e-05 0
## 6 7.575758 0.1 0.002602661 0.02004008 5.215754e-05 0
## probability
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
# Turn the grid posterior into a sample: draw row indices with probability
# proportional to the posterior, then keep the (mu, sigma) pairs.
sample_indices <- sample( nrow(pars), size = 10000,
replace = TRUE, prob = pars$probability)
head(sample_indices)
## [1] 2827 2728 3025 3126 4035 4727
pars_sample <- pars[sample_indices, c("mu", "sigma")]
hist(pars_sample$mu)
# 95% credible interval and median for mu.
quantile(pars_sample$mu, c(0.025, 0.5, 0.975))
## 2.5% 50% 97.5%
## 34.84848 42.42424 50.00000
head(pars_sample)
## mu sigma
## 2827 39.39394 14.21313
## 2728 40.90909 13.70909
## 3025 36.36364 15.22121
## 3126 37.87879 15.72525
## 4035 51.51515 20.26162
## 4727 39.39394 23.78990
# Posterior predictive: one simulated IQ per posterior (mu, sigma) draw,
# propagating parameter uncertainty into the prediction.
pred_iq <- rnorm(10000, mean = pars_sample$mu, sd = pars_sample$sigma)
hist(pred_iq)
mean(pred_iq >= 60)
## [1] 0.0886
# The IQ of zombies on a regular diet and a brain based diet.
iq_brains <- c(44, 52, 42, 66, 53, 42, 55, 57, 56, 51)
iq_regular <- c(55, 44, 34, 18, 51, 40, 40, 49, 48, 46)
mean(iq_brains) - mean(iq_regular)
## [1] 9.3
# Need to load http://www.sourceforge.net/projects/mcmc-jags/files for rjags (called by BEST)
# BEST::BESTmcmc gives a full Bayesian two-group comparison; left commented
# out because it requires the external JAGS installation above.
# library(BEST)
# best_posterior <- BESTmcmc(iq_brains, iq_regular)
# plot(best_posterior)
Chapter 1 - Introduction to Factor Variables
Introduction to qualitative variables:
Understanding your qualitative variables:
Making better plots:
Example code includes:
# Load the Kaggle ML survey responses; readr leaves strings as characters
# (every column parses as character except Age).
multiple_choice_answers <- readr::read_csv("./RInputFiles/smc_with_js.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## Age = col_integer()
## )
## See spec(...) for full column specifications.
# Print out the dataset
glimpse(multiple_choice_answers)
## Observations: 16,716
## Variables: 47
## $ LearningPlatformUsefulnessArxiv <chr> NA, NA, "Very usef...
## $ LearningPlatformUsefulnessBlogs <chr> NA, NA, NA, "Very ...
## $ LearningPlatformUsefulnessCollege <chr> NA, NA, "Somewhat ...
## $ LearningPlatformUsefulnessCompany <chr> NA, NA, NA, NA, NA...
## $ LearningPlatformUsefulnessConferences <chr> "Very useful", NA,...
## $ LearningPlatformUsefulnessFriends <chr> NA, NA, NA, "Very ...
## $ LearningPlatformUsefulnessKaggle <chr> NA, "Somewhat usef...
## $ LearningPlatformUsefulnessNewsletters <chr> NA, NA, NA, NA, NA...
## $ LearningPlatformUsefulnessCommunities <chr> NA, NA, NA, NA, NA...
## $ LearningPlatformUsefulnessDocumentation <chr> NA, NA, NA, "Very ...
## $ LearningPlatformUsefulnessCourses <chr> NA, NA, "Very usef...
## $ LearningPlatformUsefulnessProjects <chr> NA, NA, NA, "Very ...
## $ LearningPlatformUsefulnessPodcasts <chr> "Very useful", NA,...
## $ LearningPlatformUsefulnessSO <chr> NA, NA, NA, NA, NA...
## $ LearningPlatformUsefulnessTextbook <chr> NA, NA, NA, NA, "S...
## $ LearningPlatformUsefulnessTradeBook <chr> "Somewhat useful",...
## $ LearningPlatformUsefulnessTutoring <chr> NA, NA, NA, NA, NA...
## $ LearningPlatformUsefulnessYouTube <chr> NA, NA, "Very usef...
## $ CurrentJobTitleSelect <chr> "DBA/Database Engi...
## $ MLMethodNextYearSelect <chr> "Random Forests", ...
## $ WorkChallengeFrequencyPolitics <chr> "Rarely", NA, NA, ...
## $ WorkChallengeFrequencyUnusedResults <chr> NA, NA, NA, "Often...
## $ WorkChallengeFrequencyUnusefulInstrumenting <chr> NA, NA, NA, "Often...
## $ WorkChallengeFrequencyDeployment <chr> NA, NA, NA, "Often...
## $ WorkChallengeFrequencyDirtyData <chr> NA, NA, NA, "Often...
## $ WorkChallengeFrequencyExplaining <chr> NA, NA, NA, "Often...
## $ WorkChallengeFrequencyPass <chr> NA, NA, NA, NA, NA...
## $ WorkChallengeFrequencyIntegration <chr> NA, NA, NA, "Often...
## $ WorkChallengeFrequencyTalent <chr> NA, NA, NA, "Often...
## $ WorkChallengeFrequencyDataFunds <chr> NA, NA, NA, "Often...
## $ WorkChallengeFrequencyDomainExpertise <chr> NA, NA, NA, "Most ...
## $ WorkChallengeFrequencyML <chr> NA, NA, NA, "Often...
## $ WorkChallengeFrequencyTools <chr> NA, NA, NA, "Often...
## $ WorkChallengeFrequencyExpectations <chr> NA, NA, NA, "Often...
## $ WorkChallengeFrequencyITCoordination <chr> NA, NA, NA, NA, "S...
## $ WorkChallengeFrequencyHiringFunds <chr> NA, NA, NA, "Often...
## $ WorkChallengeFrequencyPrivacy <chr> "Often", NA, NA, "...
## $ WorkChallengeFrequencyScaling <chr> "Most of the time"...
## $ WorkChallengeFrequencyEnvironments <chr> NA, NA, NA, "Often...
## $ WorkChallengeFrequencyClarity <chr> NA, NA, NA, "Often...
## $ WorkChallengeFrequencyDataAccess <chr> NA, NA, NA, "Often...
## $ WorkChallengeFrequencyOtherSelect <chr> NA, NA, NA, NA, NA...
## $ WorkInternalVsExternalTools <chr> "Do not know", NA,...
## $ FormalEducation <chr> "Bachelor's degree...
## $ Age <int> NA, 30, 28, 56, 38...
## $ DataScienceIdentitySelect <chr> "Yes", "Yes", "Yes...
## $ JobSatisfaction <chr> "5", NA, NA, "10 -...
# Check if CurrentJobTitleSelect is a factor
is.factor(multiple_choice_answers$CurrentJobTitleSelect)
## [1] FALSE
# mutate() and summarise() in dplyr both have variants where you can add the suffix if, all, or at to change the operation
# mutate_if() applies a function to all columns where the first argument is true
# mutate_all() applies a function to all columns
# mutate_at() affects columns selected with a character vector or select helpers (e.g. mutate_at(c("height", "weight"), log))
# (These _if/_all/_at variants are superseded by across() in current dplyr.)
# Change all the character columns to factors
responses_as_factors <- multiple_choice_answers %>%
mutate_if(is.character, as.factor)
# Make a two column dataset with variable names and number of levels
number_of_levels <- responses_as_factors %>%
summarise_all(nlevels) %>%
gather(variable, num_levels)
# dplyr has two other functions that can come in handy when exploring a dataset
# The first is top_n(x, var), which selects the x rows with the largest values of var (not the first x rows)
# The other is pull(), which allows us to extract a column and take out the name, leaving only the value(s) from the column
# Select the 4 rows with the highest number of levels
number_of_levels %>%
top_n(4, num_levels)
## # A tibble: 4 x 2
## variable num_levels
## <chr> <int>
## 1 CurrentJobTitleSelect 16
## 2 MLMethodNextYearSelect 25
## 3 FormalEducation 7
## 4 JobSatisfaction 11
# How many levels does CurrentJobTitleSelect have?
number_of_levels %>%
filter(variable=="CurrentJobTitleSelect") %>%
pull(num_levels)
## [1] 16
# Get the names of the levels of CurrentJobTitleSelect
responses_as_factors %>%
pull(CurrentJobTitleSelect) %>%
levels()
## [1] "Business Analyst"
## [2] "Computer Scientist"
## [3] "Data Analyst"
## [4] "Data Miner"
## [5] "Data Scientist"
## [6] "DBA/Database Engineer"
## [7] "Engineer"
## [8] "Machine Learning Engineer"
## [9] "Operations Research Practitioner"
## [10] "Other"
## [11] "Predictive Modeler"
## [12] "Programmer"
## [13] "Researcher"
## [14] "Scientist/Researcher"
## [15] "Software Developer/Software Engineer"
## [16] "Statistician"
# Bar plot of FormalEducation in default (alphabetical) level order
ggplot(multiple_choice_answers, aes(x=FormalEducation)) +
geom_bar() +
coord_flip()
# Same plot reordered by frequency; fct_rev() after fct_infreq() puts the
# most common level at the top once the axes are flipped
ggplot(multiple_choice_answers, aes(x=fct_rev(fct_infreq(FormalEducation)))) +
geom_bar() +
coord_flip()
# Mean age per education level, with levels ordered by that mean
multiple_choice_answers %>%
filter(!is.na(Age) & !is.na(FormalEducation)) %>%
group_by(FormalEducation) %>%
summarize(mean_age = mean(Age)) %>%
ggplot(aes(x = fct_reorder(FormalEducation, mean_age), y = mean_age)) +
geom_point() +
coord_flip()
Chapter 2 - Manipulating Factor Variables
Reordering factors:
Renaming factor levels:
Collapsing factor levels:
Example code includes:
multiple_choice_responses <- multiple_choice_answers
# Print the levels of WorkInternalVsExternalTools
# (NULL because this copy still holds character columns, not factors)
levels(multiple_choice_responses$WorkInternalVsExternalTools)
## NULL
# Reorder the levels from internal to external
# (fct_relevel accepts a character vector and converts it to a factor)
mc_responses_reordered <- multiple_choice_responses %>%
mutate(WorkInternalVsExternalTools = fct_relevel(WorkInternalVsExternalTools,
c('Entirely internal', 'More internal than external',
'Approximately half internal and half external',
'More external than internal', 'Entirely external',
'Do not know'
)
)
)
# Make a bar plot of the responses
ggplot(mc_responses_reordered, aes(x=WorkInternalVsExternalTools)) +
geom_bar() +
coord_flip()
multiple_choice_responses %>%
# Move "I did not complete any formal education past high school" and "Some college/university study without earning a bachelor's degree" to the front
mutate(FormalEducation = fct_relevel(FormalEducation, c("I did not complete any formal education past high school", "Some college/university study without earning a bachelor's degree"))) %>%
# Move "Doctoral degree" to be the sixth level
# (after=6 puts it after the 6th of the remaining levels, i.e. last of 7;
# it only ends up sixth because the next step pushes another level past it)
mutate(FormalEducation = fct_relevel(FormalEducation, after=6, "Doctoral degree")) %>%
# Move "I prefer not to answer" to be the last level.
mutate(FormalEducation = fct_relevel(FormalEducation, after=Inf, "I prefer not to answer")) %>%
# Examine the new level order
pull(FormalEducation) %>%
levels()
## [1] "I did not complete any formal education past high school"
## [2] "Some college/university study without earning a bachelor's degree"
## [3] "Bachelor's degree"
## [4] "Master's degree"
## [5] "Professional degree"
## [6] "Doctoral degree"
## [7] "I prefer not to answer"
# make a bar plot of the frequency of FormalEducation
ggplot(multiple_choice_responses, aes(x=FormalEducation)) +
geom_bar()
# fct_recode uses "new name" = "old name" pairs to shorten the long labels
multiple_choice_responses %>%
# rename levels
mutate(FormalEducation = fct_recode(FormalEducation, "High school" ="I did not complete any formal education past high school", "Some college" = "Some college/university study without earning a bachelor's degree")) %>%
# make a bar plot of FormalEducation
ggplot(aes(x=FormalEducation)) +
geom_bar()
multiple_choice_responses %>%
# Create new variable, grouped_titles, by collapsing levels in CurrentJobTitleSelect
mutate(grouped_titles = fct_collapse(CurrentJobTitleSelect,
"Computer Scientist" = c("Programmer", "Software Developer/Software Engineer"),
"Researcher" = "Scientist/Researcher",
"Data Analyst/Scientist/Engineer" = c("DBA/Database Engineer", "Data Scientist",
"Business Analyst", "Data Analyst",
"Data Miner", "Predictive Modeler"))) %>%
# Turn every title that isn't now one of the grouped_titles into "Other"
mutate(grouped_titles = fct_other(grouped_titles,
keep = c("Computer Scientist",
"Researcher",
"Data Analyst/Scientist/Engineer"))) %>%
# Get a count of the grouped titles
# (NAs in CurrentJobTitleSelect remain NA, hence the <NA> row in the output)
count(grouped_titles)
## # A tibble: 5 x 2
## grouped_titles n
## <fct> <int>
## 1 Data Analyst/Scientist/Engineer 4928
## 2 Computer Scientist 2556
## 3 Researcher 1597
## 4 Other 2749
## 5 <NA> 4886
multiple_choice_responses %>%
# remove NAs of MLMethodNextYearSelect
filter(!is.na(MLMethodNextYearSelect)) %>%
# create ml_method, which lumps all those with less than 5% of people into "Other"
mutate(ml_method = fct_lump(MLMethodNextYearSelect, prop=0.05)) %>%
# print the frequency of your new variable in descending order
count(ml_method, sort=TRUE)
## # A tibble: 4 x 2
## ml_method n
## <fct> <int>
## 1 Other 4405
## 2 Deep learning 4362
## 3 Neural Nets 1386
## 4 Time Series Analysis 680
multiple_choice_responses %>%
# remove NAs
filter(!is.na(MLMethodNextYearSelect)) %>%
# create ml_method, retaining the 5 most common methods and renaming others "other method"
mutate(ml_method = fct_lump(MLMethodNextYearSelect, 5, other_level="other method")) %>%
# print the frequency of your new variable in descending order
count(ml_method, sort=TRUE)
## # A tibble: 6 x 2
## ml_method n
## <fct> <int>
## 1 Deep learning 4362
## 2 other method 3401
## 3 Neural Nets 1386
## 4 Time Series Analysis 680
## 5 Bayesian Methods 511
## 6 Text Mining 493
Chapter 3 - Creating Factor Variables
Examining common themed variables:
Tricks of ggplot2:
Changing and creating variables with dplyr::case_when():
Example code includes:
# Reshape the per-platform usefulness columns into a long (platform, rating)
# table for plotting and aggregation.
learning_platform_usefulness <- multiple_choice_responses %>%
# select columns with LearningPlatformUsefulness in title
select(contains("LearningPlatformUsefulness")) %>%
# change data from wide to long
gather(learning_platform, usefulness) %>%
# remove rows where usefulness is NA
filter(!is.na(usefulness)) %>%
# remove "LearningPlatformUsefulness" from each string in `learning_platform`
mutate(learning_platform = str_remove(learning_platform, "LearningPlatformUsefulness"))
learning_platform_usefulness %>%
# change dataset to one row per learning_platform usefulness pair with number of entries for each
count(learning_platform, usefulness) %>%
# use add_count to create column with total number of answers for that learning_platform
# (wt=n sums the existing n column; the total lands in a new column nn)
add_count(learning_platform, wt=n) %>%
# create a line graph for each question with usefulness on x-axis and percentage of responses on y
ggplot(aes(x = usefulness, y = n/nn, group = learning_platform)) +
geom_line() +
facet_wrap(~ learning_platform)
avg_usefulness <- learning_platform_usefulness %>%
# If usefulness is "Not Useful", make 0, else 1
mutate(usefulness = ifelse(usefulness=="Not Useful", 0, 1)) %>%
# Get the average usefulness by learning platform
group_by(learning_platform) %>%
summarize(avg_usefulness = mean(usefulness))
# Make a scatter plot of average usefulness by learning platform
ggplot(avg_usefulness, aes(x=learning_platform, y=avg_usefulness)) +
geom_point()
ggplot(avg_usefulness, aes(x = learning_platform, y = avg_usefulness)) +
geom_point() +
# rotate x-axis text by 90 degrees
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
# rename y and x axis labels
labs(x="Learning Platform", y="Percent finding at least somewhat useful") +
# change y axis scale to percentage
scale_y_continuous(labels = scales::percent)
# Same plot, with platforms ordered from most to least useful on average
ggplot(avg_usefulness,
aes(x = fct_rev(fct_reorder(learning_platform, avg_usefulness)), y = avg_usefulness)
) +
geom_point() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(x = "Learning Platform", y = "Percent finding at least somewhat useful") +
scale_y_continuous(labels = scales::percent)
# Sanity-check the Age column before binning: extreme values (0, 100)
# suggest data-entry noise, and there are 331 missing values.
# Check the min age
min(multiple_choice_responses$Age, na.rm=TRUE)
## [1] 0
# Check the max age
max(multiple_choice_responses$Age, na.rm=TRUE)
## [1] 100
sum(is.na(multiple_choice_responses$Age))
## [1] 331
multiple_choice_responses %>%
# Eliminate any ages below 10 and above 90
# (rows with NA Age are dropped here too, since between() returns NA)
filter(between(Age, 10, 90)) %>%
# Create the generation variable based on age
mutate(generation=case_when(
between(Age, 10, 22) ~ "Gen Z",
between(Age, 23, 37) ~ "Gen Y",
between(Age, 38, 52) ~ "Gen X",
between(Age, 53, 71) ~ "Baby Boomer",
between(Age, 72, 90) ~ "Silent"
)) %>%
# Get a count of how many answers in each generation
count(generation)
## # A tibble: 5 x 2
## generation n
## <chr> <int>
## 1 Baby Boomer 832
## 2 Gen X 3162
## 3 Gen Y 10281
## 4 Gen Z 2037
## 5 Silent 37
multiple_choice_responses %>%
# Filter out people who selected Data Scientist as their Job Title
filter(!is.na(CurrentJobTitleSelect) & CurrentJobTitleSelect != "Data Scientist") %>%
# Create a new variable, job_identity
mutate(job_identity = case_when(
CurrentJobTitleSelect == "Data Analyst" & DataScienceIdentitySelect == "Yes" ~ "DS analysts",
CurrentJobTitleSelect == "Data Analyst" & DataScienceIdentitySelect %in% c("No", "Sort of (Explain more)") ~ "NDS analyst",
CurrentJobTitleSelect != "Data Analyst" & DataScienceIdentitySelect == "Yes" ~ "DS non-analysts",
TRUE ~ "NDS non analysts")
) %>%
# Convert the JobSatisfaction strings to integers; the labelled endpoints
# ("1 - Highly Dissatisfied", "10 - Highly Satisfied") need explicit
# mapping, and the catch-all as.integer() is what emits the coercion
# warning seen below for any remaining non-numeric strings.
mutate(JobSat=case_when(
is.na(JobSatisfaction) ~ NA_integer_,
JobSatisfaction == "I prefer not to share" | JobSatisfaction == "NA" ~ NA_integer_,
JobSatisfaction == "1 - Highly Dissatisfied" ~ 1L,
JobSatisfaction == "10 - Highly Satisfied" ~ 10L,
TRUE ~ as.integer(JobSatisfaction))) %>%
# Get the average job satisfaction by job_identity, removing NAs
group_by(job_identity) %>%
summarize(avg_js = mean(JobSat, na.rm=TRUE))
## Warning in eval_bare(f[[3]], env): NAs introduced by coercion
## # A tibble: 4 x 2
## job_identity avg_js
## <chr> <dbl>
## 1 DS analysts 6.44
## 2 DS non-analysts 6.93
## 3 NDS analyst 6.14
## 4 NDS non analysts 6.43
Chapter 4 - Case Study on Flight Etiquette
Case study introduction:
Data preparation and regex:
Recreating the plot:
End of course recap:
Example code includes:
# Load the FiveThirtyEight flying-etiquette survey and clean up the column
# names that read.csv mangled (dots for spaces, dropped '?' characters).
flying_etiquette <- read.csv("./RInputFiles/flying-etiquette.csv", stringsAsFactors = FALSE)
# Replace dots with spaces, then collapse runs of spaces.
# NOTE(review): the second str_replace_all's pattern and replacement look
# identical here — the pattern was presumably a doubled space that the
# document rendering collapsed; verify against the original script.
names(flying_etiquette) <-
stringr::str_replace_all(stringr::str_replace_all(names(flying_etiquette), "\\.", " "), " ", " ")
names(flying_etiquette) <- stringr::str_trim(names(flying_etiquette))
# Columns 2:22 are questions; restore their trailing question marks.
names(flying_etiquette)[2:22] <- paste0(names(flying_etiquette)[2:22], "?")
names(flying_etiquette) <- stringr::str_replace_all(names(flying_etiquette), "itrude", "it rude")
glimpse(flying_etiquette)
## Observations: 1,040
## Variables: 27
## $ RespondentID <dbl> ...
## $ `How often do you travel by plane?` <chr> ...
## $ `Do you ever recline your seat when you fly?` <chr> ...
## $ `How tall are you?` <chr> ...
## $ `Do you have any children under 18?` <chr> ...
## $ `In a row of three seats who should get to use the two arm rests?` <chr> ...
## $ `In a row of two seats who should get to use the middle arm rest?` <chr> ...
## $ `Who should have control over the window shade?` <chr> ...
## $ `Is it rude to move to an unsold seat on a plane?` <chr> ...
## $ `Generally speaking is it rude to say more than a few words tothe stranger sitting next to you on a plane?` <chr> ...
## $ `On a 6 hour flight from NYC to LA how many times is it acceptable to get up if you re not in an aisle seat?` <chr> ...
## $ `Under normal circumstances does a person who reclines their seat during a flight have any obligation to the person sitting behind them?` <chr> ...
## $ `Is it rude to recline your seat on a plane?` <chr> ...
## $ `Given the opportunity would you eliminate the possibility of reclining seats on planes entirely?` <chr> ...
## $ `Is it rude to ask someone to switch seats with you in order to be closer to friends?` <chr> ...
## $ `Is it rude to ask someone to switch seats with you in order to be closer to family?` <chr> ...
## $ `Is it rude to wake a passenger up if you are trying to go to the bathroom?` <chr> ...
## $ `Is it rude to wake a passenger up if you are trying to walk around?` <chr> ...
## $ `In general is it rude to bring a baby on a plane?` <chr> ...
## $ `In general is it rude to knowingly bring unruly children on a plane?` <chr> ...
## $ `Have you ever used personal electronics during take off or landing in violation of a flight attendant s direction?` <chr> ...
## $ `Have you ever smoked a cigarette in an airplane bathroom when it was against the rules?` <chr> ...
## $ Gender <chr> ...
## $ Age <chr> ...
## $ `Household Income` <chr> ...
## $ Education <chr> ...
## $ `Location Census Region` <chr> ...
# Keep only flyers, reshape the "Is it rude to..." questions to long format.
gathered_data <- flying_etiquette %>%
mutate_if(is.character, as.factor) %>%
filter(`How often do you travel by plane?` != "Never") %>%
# Select columns containing "rude"
select(contains("rude")) %>%
# Change format from wide to long
gather(response_var, value)
## Warning: attributes are not identical across measure variables;
## they will be dropped
# Strip the question boilerplate from each variable name and score every
# non-"No" answer (including NA) as rude, then average per behavior.
rude_behaviors <- gathered_data %>%
mutate(response_var = str_replace(response_var, '.*rude to ', '')) %>%
mutate(response_var = str_replace(response_var, 'on a plane', '')) %>%
mutate(rude = if_else(value %in% c("No, not rude at all", "No, not at all rude"), 0, 1)) %>%
# Create perc_rude, the percent considering each behavior rude
group_by(response_var) %>%
summarize(perc_rude=mean(rude))
rude_behaviors
## # A tibble: 9 x 2
## response_var perc_rude
## <chr> <dbl>
## 1 ask someone to switch seats with you in order to be closer to~ 0.193
## 2 ask someone to switch seats with you in order to be closer to~ 0.278
## 3 bring a baby ? 0.323
## 4 knowingly bring unruly children ? 0.832
## 5 move to an unsold seat ? 0.211
## 6 recline your seat ? 0.426
## 7 say more than a few words tothe stranger sitting next to you ? 0.228
## 8 wake a passenger up if you are trying to go to the bathroom? 0.388
## 9 wake a passenger up if you are trying to walk around? 0.741
# Create an ordered bar plot of behaviors by the percentage considering each rude
initial_plot <- ggplot(rude_behaviors, aes(x=fct_reorder(response_var, perc_rude), y=perc_rude)) +
geom_col()
# View your plot
initial_plot
titled_plot <- initial_plot +
# Add the title, subtitle, and caption
labs(title = "Hell Is Other People In A Pressurized Metal Tube",
subtitle = "Percentage of 874 air-passenger respondents who said action is very or somewhat rude",
caption = "Source: SurveyMonkey Audience",
# Remove the x- and y-axis labels
x="",
y=""
)
titled_plot
flipped_plot <- titled_plot +
# Flip the axes
coord_flip() +
# Remove the x-axis ticks and labels
theme(axis.ticks.x = element_blank(),
axis.text.x = element_blank())
flipped_plot +
# Add labels above the bar with the perc value
geom_text(aes(label = paste0(round(100*perc_rude), "%"), y = perc_rude + .03),
position = position_dodge(0.9), vjust = 1)
Chapter 1 - Introduction to Bayesian Modeling
Prior model:
Data and likelihood:
Posterior model:
Example code includes:
# Make sure you have installed JAGS-4.x.y.exe (for any x >=0, y>=0) from http://www.sourceforge.net/projects/mcmc-jags/files
# Compare three Beta priors of increasing informativeness by simulation.
# Sample 10000 draws from Beta(45,55) prior
prior_A <- rbeta(n = 10000, shape1 = 45, shape2 = 55)
# Store the results in a data frame
prior_sim <- data.frame(prior_A)
# Construct a density plot of the prior sample
ggplot(prior_sim, aes(x = prior_A)) +
geom_density()
# Sample 10000 draws from the Beta(1,1) prior
prior_B <- rbeta(n = 10000, shape1 = 1, shape2 = 1)
# Sample 10000 draws from the Beta(100,100) prior
prior_C <- rbeta(n = 10000, shape1 = 100, shape2 = 100)
# Combine the results in a single data frame
prior_sim <- data.frame(samples = c(prior_A, prior_B, prior_C),
priors = rep(c("A","B","C"), each = 10000))
# Plot the 3 priors
ggplot(prior_sim, aes(x = samples, fill = priors)) +
geom_density(alpha = 0.5)
# Simulate the likelihood: for each candidate support level p, simulate one
# 10-voter poll, then look at which p values tend to produce 6 "yes" votes.
# Define a vector of 1000 p values
p_grid <- seq(0, 1, length.out=1000)
# Simulate 1 poll result for each p in p_grid
poll_result <- rbinom(1000, 10, prob=p_grid)
# Create likelihood_sim data frame
likelihood_sim <- data.frame(p_grid, poll_result)
# Density plots of p_grid grouped by poll_result
ggplot(likelihood_sim, aes(x = p_grid, y = poll_result, group = poll_result)) +
ggridges::geom_density_ridges()
# Same ridgeline plot, highlighting the poll_result == 6 ridge
ggplot(likelihood_sim, aes(x = p_grid, y = poll_result, group = poll_result, fill = poll_result==6)) +
ggridges::geom_density_ridges()
# Keep the polls with X = 6
likelihood_sim_6 <- likelihood_sim %>%
filter(poll_result==6)
# Construct a density plot of the remaining p_grid values
# (this conditional density approximates the scaled likelihood of p given X=6)
ggplot(likelihood_sim_6, aes(x = p_grid)) +
geom_density() +
lims(x = c(0,1))
# Beta-Binomial election model in JAGS: X yes-votes out of n, Beta(a,b)
# prior on support p. The same model is re-fit below under four prior/data
# combinations to show how each influences the posterior.
# DEFINE the model
vote_model <- "model{
# Likelihood model for X
X ~ dbin(p, n)
# Prior model for p
p ~ dbeta(a, b)
}"
# Run 1: informative Beta(45,55) prior, small poll (6 of 10)
# COMPILE the model
vote_jags <- jags.model(textConnection(vote_model),
data = list(a = 45, b = 55, X = 6, n = 10),
inits = list(.RNG.name = "base::Wichmann-Hill", .RNG.seed = 100))
# SIMULATE the posterior
vote_sim <- coda.samples(model = vote_jags, variable.names = c("p"), n.iter = 10000)
# PLOT the posterior
plot(vote_sim, trace = FALSE)
# Run 2: flat Beta(1,1) prior, same small poll
# COMPILE the model
vote_jags <- jags.model(textConnection(vote_model),
data = list(a = 1, b = 1, X = 6, n = 10),
inits = list(.RNG.name = "base::Wichmann-Hill", .RNG.seed = 100))
# SIMULATE the posterior
vote_sim <- coda.samples(model = vote_jags, variable.names = c("p"), n.iter = 10000)
# PLOT the posterior
plot(vote_sim, trace = FALSE, xlim = c(0,1), ylim = c(0,18))
# Run 3: flat prior, larger poll (220 of 400) — data dominates
# COMPILE the model
vote_jags <- jags.model(textConnection(vote_model),
data = list(a = 1, b = 1, X = 220, n = 400),
inits = list(.RNG.name = "base::Wichmann-Hill", .RNG.seed = 100))
# SIMULATE the posterior
vote_sim <- coda.samples(model = vote_jags, variable.names = c("p"), n.iter = 10000)
# PLOT the posterior
plot(vote_sim, trace = FALSE, xlim = c(0,1), ylim = c(0,18))
# Run 4: informative prior with the larger poll
# COMPILE the model
vote_jags <- jags.model(textConnection(vote_model),
data = list(a = 45, b = 55, X = 220, n = 400),
inits = list(.RNG.name = "base::Wichmann-Hill", .RNG.seed = 100))
# SIMULATE the posterior
vote_sim <- coda.samples(model = vote_jags, variable.names = c("p"), n.iter = 10000)
# PLOT the posterior
plot(vote_sim, trace = FALSE, xlim = c(0,1), ylim = c(0,18))
Chapter 2 - Bayesian Models and Markov Chains
Normal-Normal Model:
Simulating Normal-Normal in RJAGS:
Y[i] ~ dnorm(m, s^(-2)) # requires precision, which can be defined as the inverse of sigma-squared
Markov chains:
Markov chain diagnostics and reproducibility:
Example code includes:
# Visualise the Normal-Normal priors: m ~ N(50, 25), s ~ Uniform(0, 200).
# Take 10000 samples from the m prior
prior_m <- rnorm(10000, 50, 25)
# Take 10000 samples from the s prior
prior_s <- runif(10000, 0, 200)
# Store samples in a data frame
samples <- data.frame(prior_m, prior_s)
# Density plots of the prior_m & prior_s samples
ggplot(samples, aes(x = prior_m)) +
geom_density()
ggplot(samples, aes(x = prior_s)) +
geom_density()
# Check out the first 6 rows of sleep_study
head(sleep_study)
# Define diff_3: change in reaction time from day 0 to day 3
sleep_study <- sleep_study %>%
mutate(diff_3=day_3-day_0)
# Histogram of diff_3
ggplot(sleep_study, aes(x = diff_3)) +
geom_histogram(binwidth = 20, color = "white")
# Mean and standard deviation of diff_3
sleep_study %>%
summarize(mean(diff_3), sd(diff_3))
# Normal-Normal model in JAGS. Note dnorm() in JAGS takes a *precision*
# (1/sigma^2), hence the s^(-2) and 25^(-2) below.
# DEFINE the model
sleep_model <- "model{
# Likelihood model for Y[i]
for(i in 1:length(Y)) {
Y[i] ~ dnorm(m, s^(-2))
}
# Prior models for m and s
m ~ dnorm(50, 25^(-2))
s ~ dunif(0, 200)
}"
# COMPILE the model (fixed RNG name/seed makes the chain reproducible)
sleep_jags <- jags.model(textConnection(sleep_model), data = list(Y = sleep_study$diff_3),
inits = list(.RNG.name = "base::Wichmann-Hill", .RNG.seed = 1989))
# SIMULATE the posterior
sleep_sim <- coda.samples(model = sleep_jags, variable.names = c("m", "s"), n.iter = 10000)
# PLOT the posterior
plot(sleep_sim, trace = FALSE)
# Let m be the average change in reaction time after 3 days of sleep deprivation
# In a previous exercise, you obtained an approximate sample of 10,000 draws from the posterior model of m
# You stored the resulting mcmc.list object as sleep_sim which is loaded in your workspace:
# In fact, the sample of m values in sleep_sim is a dependent Markov chain, the distribution of which converges to the posterior
# You will examine the contents of sleep_sim and, to have finer control over your analysis, store the contents in a data frame
# Check out the head of sleep_sim
head(sleep_sim)
# Store the chains in a data frame
# (sleep_sim[[1]] extracts the first — here only — chain of the mcmc.list)
sleep_chains <- data.frame(sleep_sim[[1]], iter = 1:10000)
# Check out the head of sleep_chains
head(sleep_chains)
# NOTE: The 10,000 recorded Iterations start after a "burn-in" period in which samples are discarded
# Thus the Iterations count doesn't start at 1!
# Use plot() to construct trace plots of the m and s chains
plot(sleep_sim, density=FALSE)
# Use ggplot() to construct a trace plot of the m chain
ggplot(sleep_chains, aes(x = iter, y = m)) +
geom_line()
# Trace plot the first 100 iterations of the m chain
ggplot(dplyr::filter(sleep_chains, iter<=100), aes(x = iter, y = m)) + geom_line()
# Note that the longitudinal behavior of the chain appears quite random and that the trend remains relatively constant
# This is a good thing - it indicates that the Markov chain (likely) converges quickly to the posterior distribution of m
# Use plot() to construct density plots of the m and s chains
plot(sleep_sim, trace=FALSE)
# Use ggplot() to construct a density plot of the m chain
ggplot(sleep_chains, aes(x = m)) +
geom_density()
# Density plot of the first 100 values in the m chain
ggplot(dplyr::filter(sleep_chains, iter<=100), aes(x = m)) +
geom_density()
# COMPILE the model
sleep_jags_multi <- jags.model(textConnection(sleep_model), data = list(Y = sleep_study$diff_3), n.chains=4)
# SIMULATE the posterior
sleep_sim_multi <- coda.samples(model = sleep_jags_multi, variable.names = c("m", "s"), n.iter = 1000)
# Check out the head of sleep_sim_multi
head(sleep_sim_multi)
# Construct trace plots of the m and s chains
plot(sleep_sim_multi, density=FALSE)
# The mean of the m Markov chain provides an estimate of the posterior mean of m
# The naive standard error provides a measure of the estimate's accuracy.
# Suppose your goal is to estimate the posterior mean of m within a standard error of 0.1 ms
# If the observed naive standard error exceeds this target, no problem!
# You can simply run a longer chain
# SIMULATE the posterior
sleep_sim_1 <- coda.samples(model = sleep_jags, variable.names = c("m", "s"), n.iter = 1000)
# Summarize the m and s chains of sleep_sim_1
summary(sleep_sim_1)
# RE-SIMULATE the posterior
sleep_sim_2 <- coda.samples(model = sleep_jags, variable.names = c("m", "s"), n.iter = 10000)
# Summarize the m and s chains of sleep_sim_2
summary(sleep_sim_2)
# COMPILE the model
sleep_jags <- jags.model(textConnection(sleep_model), data = list(Y = sleep_study$diff_3), inits = list(.RNG.name = "base::Wichmann-Hill", .RNG.seed = 1989))
# SIMULATE the posterior
sleep_sim <- coda.samples(model = sleep_jags, variable.names = c("m", "s"), n.iter = 10000)
# Summarize the m and s chains of sleep_sim
summary(sleep_sim)
Chapter 3 - Bayesian Inference and Prediction
Simple Bayesian Regression Model:
Bayesian Regression in RJAGS:
Y[i] ~ dnorm(m[i], s^(-2)) m[i] <- a + b * X[i] Posterior estimation and inference:
Posterior prediction:
Example code includes:
# Note the 3 parameters in the model of weight by height: intercept a, slope b, & standard deviation s
# In the first step of your Bayesian analysis, you will simulate the following prior models for these parameters: a ~ N(0, 200^2), b ~ N(1, 0.5^2), and s ~ Unif(0, 20)
# Take 10000 samples from the a, b, & s priors
prior_a <- rnorm(10000, 0, 200)
prior_b <- rnorm(10000, 1, 0.5)
prior_s <- runif(10000, 0, 20)
# Store samples in a data frame
samples <- data.frame(prior_a, prior_b, prior_s, set=1:10000)
# Construct density plots of the prior samples
ggplot(samples, aes(x = prior_a)) +
geom_density()
ggplot(samples, aes(x = prior_b)) +
geom_density()
ggplot(samples, aes(x = prior_s)) +
geom_density()
# Replicate the first 12 parameter sets 50 times each
prior_scenarios_rep <- bind_rows(replicate(n = 50, expr = samples[1:12, ], simplify = FALSE))
# Simulate 50 height & weight data points for each parameter set
prior_simulation <- prior_scenarios_rep %>%
mutate(height = rnorm(600, 170, 10)) %>%
mutate(weight = rnorm(600, prior_a + prior_b*height, prior_s))
# Plot the simulated data & regression model for each parameter set
ggplot(prior_simulation, aes(x = height, y = weight)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, size = 0.75) +
facet_wrap(~ set)
# The bdims data set from the openintro package is loaded in your workspace
# bdims contains physical measurements on a sample of 507 individuals, including their weight in kg (wgt) and height in cm (hgt)
# Construct a scatterplot of wgt vs hgt
ggplot(bdims, aes(x = hgt, y = wgt)) +
geom_point()
# Add a model smooth
ggplot(bdims, aes(x = hgt, y = wgt)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
# Obtain the sample regression model
wt_model <- lm(wgt ~ hgt, data = bdims)
# Summarize the model
summary(wt_model)
# DEFINE the model
weight_model <- "model{
# Likelihood model for Y[i]
for(i in 1:length(Y)) {
Y[i] ~ dnorm(m[i], s^(-2))
m[i] <- a + b * X[i]
}
# Prior models for a, b, s
a ~ dnorm(0, 200^(-2))
b ~ dnorm(1, 0.5^(-2))
s ~ dunif(0, 20)
}"
# COMPILE the model
weight_jags <- jags.model(textConnection(weight_model), data = list(X=bdims$hgt, Y=bdims$wgt),
inits = list(.RNG.name = "base::Wichmann-Hill", .RNG.seed = 1989))
# COMPILE the model
weight_jags <- jags.model(textConnection(weight_model), data = list(Y = bdims$wgt, X = bdims$hgt),
inits = list(.RNG.name = "base::Wichmann-Hill", .RNG.seed = 1989))
# SIMULATE the posterior
weight_sim <- coda.samples(model = weight_jags, variable.names = c("a", "b", "s"), n.iter = 1000)
# PLOT the posterior
plot(weight_sim)
# A 100,000 iteration RJAGS simulation of the posterior, weight_sim_big, is in your workspace along with a data frame of the Markov chain output:
head(weight_chains, 2)
# The posterior means of the intercept & slope parameters, a & b, reflect the posterior mean trend in the relationship between weight & height
# In contrast, the full posteriors of a & b reflect the range of plausible parameters, thus posterior uncertainty in the trend
# You will examine the trend and uncertainty in this trend below
# The bdims data are in your workspace
# Summarize the posterior Markov chains
summary(weight_sim_big)
# Calculate the estimated posterior mean of b
mean(weight_chains$b)
# Plot the posterior mean regression model
ggplot(bdims, aes(x=hgt, y=wgt)) +
geom_point() +
geom_abline(intercept = mean(weight_chains$a), slope = mean(weight_chains$b), color = "red")
# Visualize the range of 20 posterior regression models
ggplot(bdims, aes(x=hgt, y=wgt)) +
geom_point() +
geom_abline(intercept = weight_chains$a[1:20], slope = weight_chains$b[1:20], color = "gray", size = 0.25)
# Summarize the posterior Markov chains
summary(weight_sim_big)
# Calculate the 95% posterior credible interval for b
quantile(weight_chains$b, c(0.025, 0.975))
# Calculate the 90% posterior credible interval for b
quantile(weight_chains$b, c(0.05, 0.95))
# Mark the 90% credible interval
ggplot(weight_chains, aes(x = b)) +
geom_density() +
geom_vline(xintercept = quantile(weight_chains$b, c(0.05, 0.95)), color = "red")
# Mark 1.1 on a posterior density plot for b
ggplot(weight_chains, aes(x=b)) +
geom_density() +
geom_vline(xintercept = 1.1, color = "red")
# Summarize the number of b chain values that exceed 1.1
table(weight_chains$b > 1.1)
# Calculate the proportion of b chain values that exceed 1.1
mean(weight_chains$b > 1.1)
# Calculate the trend under each Markov chain parameter set
weight_chains <- weight_chains %>%
mutate(m_180 = a + b*180)
# Construct a posterior density plot of the trend
ggplot(weight_chains, aes(x = m_180)) +
geom_density()
# Calculate the average trend
mean(weight_chains$m_180)
# Construct a posterior credible interval for the trend
quantile(weight_chains$m_180, c(0.025, 0.975))
# Simulate 1 prediction under the first parameter set
rnorm(1, mean=weight_chains$m_180[1], sd=weight_chains$s[1])
# Simulate 1 prediction under the second parameter set
rnorm(1, mean=weight_chains$m_180[2], sd=weight_chains$s[2])
# Simulate & store 1 prediction under each parameter set
weight_chains <- weight_chains %>%
mutate(Y_180=rnorm(nrow(weight_chains), mean=m_180, sd=s))
# Print the first 6 parameter sets & predictions
head(weight_chains)
# Construct a density plot of the posterior predictions
ggplot(weight_chains, aes(x=Y_180)) +
geom_density() +
geom_vline(xintercept = quantile(weight_chains$Y_180, c(0.025, 0.975)), color = "red")
# Construct a posterior credible interval for the prediction
quantile(weight_chains$Y_180, c(0.025, 0.975))
# Visualize the credible interval on a scatterplot of the data
ggplot(bdims, aes(x=hgt, y=wgt)) +
geom_point() +
geom_abline(intercept = mean(weight_chains$a), slope = mean(weight_chains$b), color = "red") +
geom_segment(x = 180, xend = 180, y = quantile(weight_chains$Y_180, c(0.025)), yend = quantile(weight_chains$Y_180, c(0.975)), color = "red")
Chapter 4 - Multivariate and Generalized Linear Models
Bayesian regression with categorical predictor:
Y[i] ~ dnorm(m[i], s^(-2)); m[i] <- a + b[X[i]]
Multivariate Bayesian regression:
Y[i] ~ dnorm(m[i], s^(-2)); m[i] <- a + b[X[i]] + c * Z[i]
Bayesian Poisson regression:
Y[i] ~ dpois(l[i]); log(l[i]) <- a + b[X[i]] + c * Z[i]
Wrap up:
Example code includes:
# Confirm that weekday is a factor variable
is.factor(RailTrail$weekday)
# Construct a density plot of volume by weekday
ggplot(RailTrail, aes(x = volume, fill = weekday)) +
geom_density(alpha = 0.5)
# Calculate the mean volume on weekdays vs weekends
RailTrail %>%
group_by(weekday) %>%
summarize(mean(volume))
# DEFINE the model
rail_model_1 <- "model{
# Likelihood model for Y[i]
for(i in 1:length(Y)) {
Y[i] ~ dnorm(m[i], s^(-2))
m[i] <- a + b[X[i]]
}
# Prior models for a, b, s
a ~ dnorm(400, 100^(-2))
b[1] <- 0
b[2] ~ dnorm(0, 200^(-2))
s ~ dunif(0, 200)
}"
# COMPILE the model
rail_jags_1 <- jags.model(textConnection(rail_model_1),
data = list(Y=RailTrail$volume, X=RailTrail$weekday),
inits = list(.RNG.name = "base::Wichmann-Hill", .RNG.seed = 10)
)
# COMPILE the model
rail_jags_1 <- jags.model(textConnection(rail_model_1), data = list(Y = RailTrail$volume, X = RailTrail$weekday),
inits = list(.RNG.name = "base::Wichmann-Hill", .RNG.seed = 10))
# SIMULATE the posterior
rail_sim_1 <- coda.samples(model = rail_jags_1, variable.names = c("a", "b", "s"), n.iter = 10000)
# Store the chains in a data frame
rail_chains_1 <- data.frame(rail_sim_1[[1]])
# PLOT the posterior
plot(rail_sim_1)
# Posterior probability that typical volume is lower on weekdays
mean(rail_chains_1$'b.2.' < 0)
# Construct a chain of values for the typical weekday volume
rail_chains_1 <- rail_chains_1 %>%
mutate(weekday_mean = a + b.2.)
# Construct a density plot of the weekday chain
ggplot(rail_chains_1, aes(x=weekday_mean)) +
geom_density()
# 95% credible interval for typical weekday volume
quantile(rail_chains_1$weekday_mean, c(0.025, 0.975))
# Construct a plot of volume by hightemp & weekday
ggplot(RailTrail, aes(x=hightemp, y=volume, color=weekday)) +
geom_point()
# Construct a sample model
rail_lm <- lm(volume ~ weekday + hightemp, data=RailTrail)
# Summarize the model
summary(rail_lm)
# Superimpose sample estimates of the model lines
ggplot(RailTrail, aes(x=hightemp, y=volume, color=weekday)) +
geom_point() +
geom_abline(intercept = coef(rail_lm)["(Intercept)"], slope = coef(rail_lm)["hightemp"], color = "red") +
geom_abline(intercept = sum(coef(rail_lm)[c("(Intercept)", "weekdayTRUE")]), slope = coef(rail_lm)["hightemp"], color = "turquoise3")
# DEFINE the model
rail_model_2 <- "model{
# Likelihood model for Y[i]
for(i in 1:length(Y)){
Y[i] ~ dnorm(m[i], s^(-2))
m[i] <- a + b[X[i]] + c * Z[i]
}
# Prior models for a, b, c, s
a ~ dnorm(0, 200^(-2))
b[1] <- 0
b[2] ~ dnorm(0, 200^(-2))
c ~ dnorm(0, 20^(-2))
s ~ dunif(0, 200)
}"
# COMPILE the model
rail_jags_2 <- jags.model(textConnection(rail_model_2),
data = list(Y=RailTrail$volume, X=RailTrail$weekday, Z=RailTrail$hightemp),
inits = list(.RNG.name = "base::Wichmann-Hill", .RNG.seed = 10)
)
# SIMULATE the posterior
rail_sim_2 <- coda.samples(model = rail_jags_2, variable.names = c("a", "b", "c", "s"), n.iter = 10000)
# Store the chains in a data frame
rail_chains_2 <- data.frame(rail_sim_2[[1]])
# PLOT the posterior
plot(rail_sim_2)
# Summarize the posterior Markov chains
summary(rail_sim_2)
# Plot the posterior mean regression models
ggplot(RailTrail, aes(x=hightemp, y=volume, color=weekday)) +
geom_point() +
geom_abline(intercept = mean(rail_chains_2[, "a"]), slope = mean(rail_chains_2[, "c"]), color = "red") +
geom_abline(intercept = mean(rail_chains_2[, "a"]) + mean(rail_chains_2[, "b.2."]), slope = mean(rail_chains_2[, "c"]), color = "turquoise3")
# Posterior probability that typical volume is lower on weekdays
mean(rail_chains_2$'b.2.' < 0)
# DEFINE the model
poisson_model <- "model{
# Likelihood model for Y[i]
for(i in 1:length(Y)) {
Y[i] ~ dpois(l[i])
log(l[i]) <- a + b[X[i]] + c * Z[i]
}
# Prior models for a, b, c
a ~ dnorm(0, 200^(-2))
b[1] <- 0
b[2] ~ dnorm(0, 2^(-2))
c ~ dnorm(0, 2^(-2))
}"
# COMPILE the model
poisson_jags <- jags.model(textConnection(poisson_model),
data = list(Y=RailTrail$volume, X=RailTrail$weekday, Z=RailTrail$hightemp),
inits = list(.RNG.name = "base::Wichmann-Hill", .RNG.seed = 10)
)
# SIMULATE the posterior
poisson_sim <- coda.samples(model = poisson_jags, variable.names = c("a", "b", "c"), n.iter = 10000)
# Store the chains in a data frame
poisson_chains <- data.frame(poisson_sim[[1]])
# PLOT the posterior
plot(poisson_sim)
# Summarize the posterior Markov chains
summary(poisson_sim)
# Plot the posterior mean regression models
ggplot(RailTrail, aes(x = hightemp, y = volume, color = weekday)) +
geom_point() +
stat_function(fun = function(x){exp(5.01352 + 0.01426 * x)}, color = "red") +
stat_function(fun = function(x){exp(5.01352 - 0.12800 + 0.01426 * x)}, color = "turquoise3")
# Calculate the typical volume on 80 degree weekends & 80 degree weekdays
poisson_chains <- poisson_chains %>%
mutate(l_weekend=exp(a + c*80)) %>%
mutate(l_weekday=exp(a + b.2. + c*80))
# Construct a 95% CI for typical volume on 80 degree weekend
quantile(poisson_chains$l_weekend, c(0.025, 0.975))
# Construct a 95% CI for typical volume on 80 degree weekday
quantile(poisson_chains$l_weekday, c(0.025, 0.975))
# Simulate weekend & weekday predictions under each parameter set
poisson_chains <- poisson_chains %>%
mutate(Y_weekend=rpois(nrow(poisson_chains), l_weekend)) %>%
mutate(Y_weekday=rpois(nrow(poisson_chains), l_weekday))
# Print the first 6 sets of parameter values & predictions
head(poisson_chains)
# Construct a density plot of the posterior weekday predictions
ggplot(poisson_chains, aes(x=Y_weekday)) +
geom_density()
# Posterior probability that weekday volume is less than 400
mean(poisson_chains$Y_weekday < 400)
Chapter 1 - Can I run my application in parallel?
Partitioning problems in to independent pieces:
Models of parallel computing:
R packages for parallel computing:
Example code includes:
# Return all words of one Jane Austen book as a lowercase character vector.
# `book_name` must match a level of austen_books()$book (e.g. "Emma").
# Requires the janeaustenr (book texts) and stringr (str_extract_all) packages.
extract_words <- function(book_name) {
# extract the text of the book
text <- subset(austen_books(), book == book_name)$text
# extract words from the text and convert to lowercase
# boundary("word") splits on word boundaries; unlist flattens the per-line lists
str_extract_all(text, boundary("word")) %>% unlist %>% tolower
}
# Return a single character vector of every word from the six Jane Austen
# books bundled in janeaustenr, in book order. Element names carry the
# book-name prefixes produced by unlist() on a named list.
janeausten_words <- function() {
  # Names of the six books contained in janeaustenr
  books <- austen_books()$book %>% unique %>% as.character
  # Vector of words from all six books.
  # simplify = FALSE keeps the per-book results as a named list regardless of
  # shape (avoids sapply's type-unstable auto-simplification); unlist() then
  # flattens it exactly as the simplified form did.
  words <- unlist(sapply(books, extract_words, simplify = FALSE))
  words
}
# Local stand-in for janeaustenr::austen_books(): builds a tibble-classed
# data frame with one row per line of text and a `book` factor column.
austen_books <- function ()
{
books <- list('Sense & Sensibility' = janeaustenr::sensesensibility,
'Pride & Prejudice' = janeaustenr::prideprejudice,
'Mansfield Park' = janeaustenr::mansfieldpark,
'Emma' = janeaustenr::emma,
'Northanger Abbey' = janeaustenr::northangerabbey,
'Persuasion' = janeaustenr::persuasion
)
# One row per line of text across all six books
ret <- data.frame(text = unlist(books, use.names = FALSE), stringsAsFactors = FALSE)
# Repeat each book's title once per line of its text
ret$book <- factor(rep(names(books), sapply(books, length)))
# Re-level so the factor follows book order rather than alphabetical order
ret$book <- factor(ret$book, levels = unique(ret$book))
# Add tbl_df/tbl classes so the result prints like a tibble
structure(ret, class = c("tbl_df", "tbl", "data.frame"))
}
max_frequency <- function(letter, words, min_length = 1) {
  # Most frequent word in `words` that starts with `letter` and has at least
  # `min_length` characters; returned as a named count (name = the word).
  matched <- select_words(letter, words = words, min_length = min_length)
  counts <- table(matched)
  counts[which.max(counts)]
}
select_words <- function(letter, words, min_length = 1) {
  # Subset `words` to those with at least `min_length` characters that begin
  # with `letter` (used as the start of a regular expression).
  long_enough <- nchar(words) >= min_length
  candidates <- words[long_enough]
  candidates[grepl(paste0("^", letter), candidates)]
}
# Vector of words from all six books
words <- janeausten_words()
# Most frequent "a"-word that is at least 5 chars long
max_frequency(letter = "a", words = words, min_length = 5)
## again
## 1001
# Partitioning
result <- lapply(letters, FUN=max_frequency,
words = words, min_length = 5) %>% unlist()
# barplot of result
barplot(result, las = 2)
replicates <- 50
sample_size <- 10000
# Function that computes mean of normal random numbers
myfunc <- function(n, ...) mean(rnorm(n, ...))
# Init result, set seed & repeat the task sequentially
result <- rep(NA, replicates)
set.seed(123)
for(iter in 1:replicates) result[iter] <- myfunc(sample_size)
# View result
hist(result)
# Use sapply() with different distribution parameters
hist(sapply(rep(sample_size, replicates), FUN=myfunc, mean = 10, sd = 5))
# We'll now introduce a demographic model to be used throughout the course. It projects net migration rates via an AR(1) model: rate(t+1) - mu = phi * (rate(t) - mu) + error, with error variance sigma^2
# An MCMC estimation for the USA resulted in 1000 samples of the parameters mu, phi and sigma
# The task is to project the future distribution of migration rates
ar1_trajectory <- function(est, rate0, len = 15) {
  # Iterate the AR(1) step `len` times starting from rate0 and return the
  # vector of simulated rates (rate0 itself is not included).
  # `est` is a named vector with elements 'mu', 'phi' and 'sigma'.
  out <- rep(NA, len)
  current <- rate0
  for (step in seq_len(len)) {
    current <- ar1(est, r = current)
    out[step] <- current
  }
  out
}
ar1 <- function(est, r) {
  # One draw from the AR(1) model: mu + phi * (r - mu) plus N(0, sigma^2) noise.
  # `est` is a named vector with elements 'mu', 'phi' and 'sigma'.
  level <- est['mu'] + est['phi'] * (r - est['mu'])
  noise <- rnorm(1, sd = est['sigma'])
  level + noise
}
# Simulate `block_size` AR(1) trajectories of length `traj_len`, all using
# parameter row `id` of the data frame `ar1est` (columns mu, phi, sigma).
# NOTE(review): relies on the globals `ar1est` and `ar1_trajectory` from the
# enclosing script being available where this runs.
ar1_block <- function(id, rate0 = 0.015, traj_len = 15, block_size = 10) {
trajectories <- matrix(NA, nrow = block_size, ncol = traj_len)
for (i in seq_len(block_size))
trajectories[i,] <- ar1_trajectory(unlist(ar1est[id, ]), rate0 = rate0, len = traj_len)
# Returns a block_size x traj_len numeric matrix of simulated rates
trajectories
}
# Plot the median migration trajectory with a pointwise 10%-90% band.
# `trajs` is a matrix: rows = simulated trajectories, columns = time steps
# (5-year intervals starting in 2020). Requires ggplot2.
show_migration <- function(trajs) {
df <- data.frame(time = seq(2020, by = 5, len = ncol(trajs)),
migration_rate = apply(trajs, 2, median),
lower = apply(trajs, 2, quantile, 0.1),
upper = apply(trajs, 2, quantile, 0.9)
)
g <- ggplot(df, aes(x = time, y = migration_rate)) +
geom_ribbon(aes(ymin = lower, ymax = upper), fill = "grey70") +
geom_line()
# print() so the plot renders even when called inside a function or loop
print(g)
}
# Simulate from multiple rows of the estimation dataset
# For each parameter-row id in `ids`, simulate one block of trajectories and
# stack all blocks into a single matrix. `...` is passed through to ar1_block().
ar1_multblocks <- function(ids, ...) {
  # Build all blocks first and rbind once: growing the result with rbind()
  # inside the loop copies the accumulated matrix on every iteration (O(n^2)).
  blocks <- lapply(ids, ar1_block, ...)
  do.call(rbind, blocks)
}
ar1est <- data.frame(mu=c(0.0105, 0.0185, 0.022, 0.0113, 0.0144, 0.0175, -9e-04, 0.0093, 0.0111, -9e-04, -0.0024, 0.0086, 0.012, 0.0161, 0.0043, 0.0175, 0.0118, 0.0019, 0.0116, 0.0048, 0.0154, 0.0137, 0.0168, 0.0191, 0.0108, -0.0037, 0.0135, 0.0203, -0.0042, 0.0097, 0.0209, 0.0034, 0.0113, 0.0102, 0.0094, -0.0012, 0.008, 0.0082, 0.0123, 0.0175, 0.0054, -0.0087, 0.0161, 0.0155, 0.0126, 0.0181, 0.014, -0.0135, -0.0095, 0.0142, 0.011, 0.0194, 0.0149, 0.0115, 0.0129, -0.0124, 0.0116, 0.0136, 0.0161, 0.005, 0.0165, -0.0079, 0.0129, -0.0016, -7e-04, 0.0243, 0.0193, -0.004, 0.0145, 0.0078, 0.0156, 0.001, 0.0032, 0.0069, 0.0146, 0.0164, 0.0113, 0.0116, 0.0182, 0.0167, -0.0031, 0.0168, 0.0137, 0.012, -0.0212, -0.0092, 0.019, 0.0167, -0.0021, 0.0156, 0.0173, 0.0148, -0.0036, 0.0168, 0.0179, 0.0086, 0.0131, 0.015, 0.0106, 0.0132, 0.0119, 0.0156, 0.0159, 0.0256, 0.0071, 0.0163, 0.0107, 0.0139, 0.0228, 0.0139, 0.0117, 0.0133, 0.0127, -0.0162, 0.0115, 0.0095, 0.0183, 0.0183, -6e-04, 0.0177, 0.0145, 0.0041, 0.0143, 0.0135, -0.0078, 0.0036, 0.015, 0.018, 0.0158, 0.0054, -0.0204, 0.0193, 0.0051, 0.0144, 0.0129, 0.0134, 0.0116, 0.0102, 0.0203, 0.0154, 0.0106, 0.0184, 0.0096, -0.0032, 0.0143, 0.0158, 0.0093, 0.0159, 0.0112, 0.0106, 0.0075, 0.0133, 0.0171, 0.0133, 0.0139, 0.0167, 0.0131, -0.0078, 0.0135, 0.0145, 0.0104, 8e-04, 0.0205, 0.0046, 0.011, 0.0148, 0.0202, 8e-04, 0.0211, 0.0135, -8e-04, -0.0104, -0.0027, 0.0094, 0.0179, -0.0101, 0.0156, 0.0155, 0.014, 0.0149, 0.0165, 0.0168, 0.0155, 0.0136, 0.0156, 0.0149, 0.0191, 0.0176, 0.0094, -0.0076, 0.0162, 0.0143, 0.0182, 0.0102, 0.015, -0.0292, 0.0063, -0.0028, 0.0163, 0.015),
sigma=c(0.0081, 0.0053, 0.0069, 0.0075, 0.0082, 0.006, 0.0101, 0.011, 0.0064, 0.0066, 0.0095, 0.0057, 0.0078, 0.005, 0.0076, 0.0064, 0.0067, 0.0049, 0.0086, 0.0067, 0.0063, 0.0054, 0.0063, 0.0077, 0.0072, 0.0074, 0.0067, 0.0047, 0.0125, 0.0069, 0.0052, 0.0073, 0.0063, 0.0072, 0.0086, 0.0079, 0.009, 0.006, 0.0077, 0.0061, 0.0082, 0.0072, 0.0054, 0.0056, 0.0072, 0.0085, 0.0064, 0.0058, 0.0064, 0.0084, 0.0075, 0.006, 0.0048, 0.0068, 0.0065, 0.0082, 0.0072, 0.0056, 0.0056, 0.0055, 0.0054, 0.0059, 0.0064, 0.0069, 0.0073, 0.0071, 0.0057, 0.0062, 0.0086, 0.0062, 0.0054, 0.0052, 0.0066, 0.0076, 0.0046, 0.0056, 0.0066, 0.0077, 0.0074, 0.0061, 0.0056, 0.0065, 0.0069, 0.0084, 0.0058, 0.007, 0.0074, 0.0077, 0.0081, 0.0083, 0.0054, 0.0057, 0.0076, 0.0119, 0.0056, 0.0078, 0.005, 0.0073, 0.0075, 0.0054, 0.0085, 0.011, 0.0063, 0.0056, 0.009, 0.0069, 0.008, 0.0063, 0.007, 0.0059, 0.0064, 0.006, 0.0103, 0.0085, 0.006, 0.0076, 0.0054, 0.0066, 0.0056, 0.0071, 0.0079, 0.007, 0.0085, 0.0075, 0.007, 0.0085, 0.006, 0.0067, 0.006, 0.0074, 0.0098, 0.0066, 0.0058, 0.0075, 0.0064, 0.0059, 0.0103, 0.0055, 0.0053, 0.0068, 0.0057, 0.009, 0.0118, 0.0096, 0.0085, 0.0075, 0.0078, 0.0041, 0.0056, 0.008, 0.0071, 0.006, 0.0046, 0.0061, 0.007, 0.0061, 0.0066, 0.0075, 0.0094, 0.0072, 0.008, 0.0064, 0.0079, 0.0068, 0.0069, 0.0058, 0.0056, 0.0057, 0.0065, 0.006, 0.0073, 0.0067, 0.0068, 0.0071, 0.0048, 0.0071, 0.0063, 0.0051, 0.0079, 0.0042, 0.0048, 0.0066, 0.0072, 0.0058, 0.0057, 0.0083, 0.0063, 0.0057, 0.0103, 0.0096, 0.0067, 0.0051, 0.0075, 0.0064, 0.0069, 0.007, 0.007, 0.0074, 0.0056, 0.006),
phi=c(0.42, 0.3509, 0.8197, 0.5304, 0.1491, 0.3675, 0.9687, 0.7877, 0.7114, 0.9435, 0.9634, 0.9189, 0.4758, 0.5738, 0.8016, 0.0509, 0.8281, 0.8168, 0.7442, 0.9347, 0.1699, 0.3566, 0.8388, 0.7724, 0.7474, 0.7834, 0.6661, 0.5162, 0.9025, 0.5306, 0.6912, 0.7625, 0.8289, 0.6985, 0.9188, 0.9639, 0.3178, 0.7288, 0.4129, 0.2196, 0.9304, 0.9697, 0.193, 0.1474, 0.3111, 0.8844, 0.7386, 0.9674, 0.9983, 0.4863, 0.9338, 0.7999, 0.4696, 0.5078, 0.5141, 0.9958, 0.6404, 0.2886, 0.4171, 0.9856, 0.3261, 0.9713, 0.682, 0.7686, 0.8577, 0.9481, 0.6057, 0.934, 0.3161, 0.9414, 0.8349, 0.8325, 0.8913, 0.7726, 0.7327, 0.1403, 0.8144, 0.7506, 0.225, 0.4884, 0.9052, 0.2891, 0.1652, 0.7612, 0.9403, 0.9865, 0.4107, 0.6518, 0.893, 0.4981, 0.72, 0.3366, 0.8437, 0.2551, 0.7753, 0.5, 0.7857, 0.7107, 0.5643, 0.2887, 0.9621, 0.2384, 0.414, 0.86, 0.6917, 0.4946, 0.2325, 0.3419, 0.9219, 0.2706, 0.717, 0.2327, 0.7541, 0.9692, 0.5838, 0.9346, 0.4739, 0.3219, 0.9634, 0.3046, 0.9913, 0.8485, 0.3071, 0.0373, 0.9183, 0.7935, 0.0039, 0.5968, 0.3654, 0.595, 0.9712, 0.2745, 0.6027, 0.7441, 0.7641, 0.3582, 0.3397, 0.7748, 0.8188, 0.0604, 0.5076, 0.2856, 0.6859, 0.6705, 0.0326, 0.8749, 0.2596, 0.1138, 0.6072, 0.4, 0.9241, 0.612, 0.2375, 0.2495, 0.0661, 0.3234, 0.7651, 0.8581, 0.4818, 0.7303, 0.7458, 0.8925, 0.2861, 0.982, 0.0791, 0.2474, 0.4326, 0.8757, 0.5288, 0.6476, 0.8473, 0.9098, 0.9562, 0.8464, 0.5444, 0.9738, 0.706, 0.0795, 0.391, 0.3167, 0.3311, 0.5681, 0.27, 0.9046, 0.2299, 0.2299, 0.085, 0.4002, 0.7443, 0.9865, 0.7028, 0.9016, 0.6092, 0.2367, 0.5402, 0.9401, 0.8013, 0.993, 0.2473, 0.6414)
)
str(ar1est)
## 'data.frame': 200 obs. of 3 variables:
## $ mu : num 0.0105 0.0185 0.022 0.0113 0.0144 0.0175 -0.0009 0.0093 0.0111 -0.0009 ...
## $ sigma: num 0.0081 0.0053 0.0069 0.0075 0.0082 0.006 0.0101 0.011 0.0064 0.0066 ...
## $ phi : num 0.42 0.351 0.82 0.53 0.149 ...
# Generate trajectories for all rows of the estimation dataset
# BUG FIX: seq_along(nrow(ar1est)) is seq_along() of a single number, which is
# just 1 - so only the first parameter row was ever simulated. seq_len() over
# the row count covers all rows as the comment intends.
trajs <- ar1_multblocks(seq_len(nrow(ar1est)), rate0 = 0.015, block_size = 10, traj_len = 15)
# Show results
show_migration(trajs)
# Load package
library(parallel)
# How many physical cores are available?
ncores <- detectCores(logical = FALSE)
# Create a cluster
cl <- makeCluster(ncores)
# Process rnorm in parallel
clusterApply(cl, 1:ncores, fun = rnorm, mean = 10, sd = 2)
## [[1]]
## [1] 8.99691
##
## [[2]]
## [1] 9.702904 7.613981
# Evaluate partial sums in parallel
part_sums <- clusterApply(cl, x = c(1, 51), fun = function(x) sum(x:(x + 49)))
# Total sum
total <- sum(unlist(part_sums))
# Check for correctness
total == sum(1:100)
## [1] TRUE
# Stop the cluster
stopCluster(cl)
# Create a cluster and set parameters
cl <- makeCluster(2)
replicates <- 50
sample_size <- 10000
# Parallel evaluation
means <- clusterApply(cl, x = rep(sample_size, replicates), fun = myfunc)
# View results as histogram
hist(unlist(means))
Chapter 2 - The parallel package
Cluster basics:
Core of parallel:
Initialization of nodes:
Subsetting data:
Example code includes:
# Load parallel and create a cluster
library(parallel)
cl <- makeCluster(4)
# Investigate the cl object and its elements
typeof(cl)
## [1] "list"
length(cl)
## [1] 4
typeof(cl[[3]])
## [1] "list"
cl[[3]]$rank
## [1] 3
# What is the process ID of the workers
clusterCall(cl, Sys.getpid)
## [[1]]
## [1] 29644
##
## [[2]]
## [1] 19332
##
## [[3]]
## [1] 4196
##
## [[4]]
## [1] 32452
# Stop the cluster
stopCluster(cl)
# Define ncores and a print function
ncores <- 2
print_ncores <- function() print(ncores)
# Create a socket and a fork clusters
# cl_sock <- makeCluster(ncores, type = "PSOCK")
# cl_fork <- makeCluster(ncores, type = "FORK") # this is possible only on OS other than Windows
# Evaluate the print function on each cluster
# clusterCall(cl_sock, print_ncores) # this will fail since the socket has no knowledge of the main environment
# clusterCall(cl_fork, print_ncores)
# Change ncores and evaluate again
# ncores <- 4
# clusterCall(cl_fork, print_ncores) # the fork is only of the original environment, so these clusters will still think the answer is 2
# In this exercise, you will take the simple embarrassingly parallel application for computing mean of random numbers (myfunc()) from the first chapter, and implement two functions:
# One that runs the application sequentially, mean_seq(), and one that runs it in parallel, mean_par()
# Both functions have three arguments, n (sample size), repl (number of replicates) and ... (passed to myfunc())
# Function mean_par() assumes a cluster object cl to be present in the global environment
# Function to run repeatedly
# Mean of n draws from a normal distribution; `...` is forwarded to rnorm()
# (e.g. mean =, sd =).
myfunc <- function(n, ...) {
  draws <- rnorm(n, ...)
  mean(draws)
}
# Sequential solution
# Run myfunc() `repl` times in sequence; returns a numeric vector of length repl.
# `...` is forwarded to myfunc().
mean_seq <- function(n, repl, ...) {
  # seq_len() instead of 1:repl so repl = 0 yields an empty loop rather than
  # iterating over c(1, 0); NA_real_ preallocates the correct (double) type.
  res <- rep(NA_real_, repl)
  for (it in seq_len(repl)) res[it] <- myfunc(n, ...)
  res
}
# Parallel solution
# Distribute `repl` calls of myfunc() across the cluster workers; returns a
# numeric vector of length repl. `...` is forwarded to myfunc().
# NOTE(review): assumes a cluster object `cl` exists in the global environment,
# and that myfunc is available on the workers (true for FORK clusters, or
# after clusterExport on PSOCK clusters) - confirm before reuse.
mean_par <- function(n, repl, ...) {
res <- clusterApply(cl, x = rep(n, repl), fun = myfunc, ...)
unlist(res)
}
# Load packages
library(parallel)
library(microbenchmark)
# Create a cluster
cl <- makeCluster(2)
# Compare run times
microbenchmark(mean_seq(3000000, repl = 4),
mean_par(3000000, repl = 4),
mean_seq(100, repl = 100),
mean_par(100, repl = 100),
times = 1, unit = "s")
## Unit: seconds
## expr min lq mean median
## mean_seq(3e+06, repl = 4) 1.793497373 1.793497373 1.793497373 1.793497373
## mean_par(3e+06, repl = 4) 0.962910477 0.962910477 0.962910477 0.962910477
## mean_seq(100, repl = 100) 0.003501865 0.003501865 0.003501865 0.003501865
## mean_par(100, repl = 100) 0.201625045 0.201625045 0.201625045 0.201625045
## uq max neval
## 1.793497373 1.793497373 1
## 0.962910477 0.962910477 1
## 0.003501865 0.003501865 1
## 0.201625045 0.201625045 1
# Stop cluster
stopCluster(cl)
# Load extraDistr on master
library(extraDistr)
##
## Attaching package: 'extraDistr'
## The following object is masked from 'package:purrr':
##
## rdunif
# Define myrdnorm
# Draw n samples from the discrete normal distribution; rdnorm() comes from
# the extraDistr package, which must be loaded wherever this is evaluated.
myrdnorm <- function(n, mean = 0, sd = 1)
rdnorm(n, mean = mean, sd = sd)
# Run myrdnorm in parallel - should fail
# res <- clusterApply(cl, rep(1000, 20), myrdnorm, sd = 6) # will error out
# Load extraDistr on all workers
cl <- makeCluster(2)
clusterEvalQ(cl, library(extraDistr))
## [[1]]
## [1] "extraDistr" "stats" "graphics" "grDevices" "utils"
## [6] "datasets" "methods" "base"
##
## [[2]]
## [1] "extraDistr" "stats" "graphics" "grDevices" "utils"
## [6] "datasets" "methods" "base"
# Run myrdnorm in parallel again and show results
res <- clusterApply(cl, rep(1000, 20), myrdnorm, sd = 6)
hist(unlist(res))
# myrdnorm that uses global variables
# NOTE(review): `mean` and `sd` here resolve to global variables defined on
# each worker (shadowing base::mean and stats::sd) - this is the point of the
# exercise, but fragile in real code. rdnorm() requires extraDistr to be
# loaded on every worker.
myrdnorm <- function(n) rdnorm(n, mean = mean, sd = sd)
# Initialize workers
clusterEvalQ(cl, {
library(extraDistr)
mean=10
sd=5
})
## [[1]]
## [1] 5
##
## [[2]]
## [1] 5
# Run myrdnorm in parallel and show results
res <- clusterApply(cl, rep(1000, 100), myrdnorm)
# View results
hist(unlist(res))
# Set global objects on master
mean <- 20
sd <- 10
# Export global objects to workers
clusterExport(cl, c("mean", "sd"))
# Load extraDistr on workers
clusterEvalQ(cl, library(extraDistr))
## [[1]]
## [1] "extraDistr" "stats" "graphics" "grDevices" "utils"
## [6] "datasets" "methods" "base"
##
## [[2]]
## [1] "extraDistr" "stats" "graphics" "grDevices" "utils"
## [6] "datasets" "methods" "base"
# Run myrdnorm in parallel and show results
res <- clusterApply(cl, rep(1000, 100), myrdnorm)
hist(unlist(res))
select_words <- function(letter, words, min_length = 1) {
  # Words of at least `min_length` characters whose start matches `letter`
  # (interpreted as the beginning of a regular expression).
  pattern <- paste0("^", letter)
  grep(pattern, words[nchar(words) >= min_length], value = TRUE)
}
# Export "select_words" to workers
clusterExport(cl, "select_words")
# Split indices for two chunks
ind <- splitIndices(length(words), 2)
# Find unique words in parallel
result <- clusterApply(cl, x = list(words[ind[[1]]], words[ind[[2]]]),
function(w, ...) unique(select_words("v", w, ...)),
min_length = 10)
# Show vectorized unique results
unique(unlist(result))
## [1] "voluntarily" "variations" "vindication" "violoncello"
## [5] "vouchsafed" "veneration" "volatility" "volubility"
## [9] "vigorously" "villainous" "vindicating" "vulnerable"
## [13] "vicissitudes" "vegetation" "vulgarisms" "valetudinarian"
## [17] "vindicated" "vouchsafing" "voluminous" "vehemently"
## [21] "valancourt" "venerating" "viscountess" "vanquished"
# Earlier you defined a function ar1_multblocks() that takes a vector of row identifiers as argument and generates migration trajectories using the corresponding rows of the parameter set ar1est
# ar1_multblocks() depends on ar1_block() which in turns depends on ar1_trajectory()
# These functions along with the cluster object cl of size 4, function show_migration(), the dataset ar1est (reduced to 200 rows) and packages parallel and ggplot2 are available in your workspace
# Simulate `block_size` AR(1) trajectories of length `traj_len` from parameter
# row `id` of the data frame `ar1est` (columns mu, phi, sigma).
# NOTE(review): ar1est and ar1_trajectory must exist on the evaluating
# process - export them to the workers before calling this in parallel.
ar1_block <- function(id, rate0 = 0.015, traj_len = 15, block_size = 10) {
trajectories <- matrix(NA, nrow = block_size, ncol=traj_len)
for (i in seq_len(block_size))
trajectories[i,] <- ar1_trajectory(unlist(ar1est[id, ]), rate0 = rate0, len = traj_len)
# Returns a block_size x traj_len numeric matrix of simulated rates
trajectories
}
ar1_trajectory <- function(est, rate0, len = 15) {
  # Simulate an AR(1) path of length `len` starting from rate0 (not included
  # in the output). `est` is a named vector with 'mu', 'phi' and 'sigma'.
  step_ar1 <- function(r) {
    # one AR(1) step: mu + phi * (r - mu) + N(0, sigma^2) noise
    est['mu'] + est['phi'] * (r - est['mu']) +
      rnorm(1, sd = est['sigma'])
  }
  path <- rep(NA, len)
  rate <- rate0
  for (i in seq_len(len)) {
    rate <- step_ar1(rate)
    path[i] <- rate
  }
  path
}
# Simulate one trajectory block per parameter-row id in `ids` and stack them
# into a single matrix. `...` is passed through to ar1_block().
ar1_multblocks <- function(ids, ...) {
  # Collect the blocks first, then rbind once: growing the matrix with
  # rbind() inside the loop copies the accumulated result each time (O(n^2)).
  blocks <- lapply(ids, ar1_block, ...)
  do.call(rbind, blocks)
}
# Export data and functions needed by the workers (PSOCK workers start with
# empty environments)
clusterExport(cl, c("ar1est", "ar1_block", "ar1_trajectory"))
# Process ar1_multblocks in parallel, one parameter row per task.
# seq_len(nrow(ar1est)) is the safe idiom: 1:nrow(x) would produce c(1, 0)
# on an empty data frame.
res <- clusterApply(cl, seq_len(nrow(ar1est)), ar1_multblocks)
# Combine the per-task matrices into one matrix and show results
trajs <- do.call(rbind, res)
show_migration(trajs)
# The object res returned by clusterApply() in the previous exercise is also in your workspace, now called res_prev
res_prev <- res
# Split task into 5 chunks
ind <- splitIndices(nrow(ar1est), 5)
# Process ar1_multblocks in parallel
res <- clusterApply(cl, ind, ar1_multblocks)
# Dimensions of results
(res_dim <- c(length(res), nrow(res[[1]])))
## [1] 5 400
(res_prev_dim <- c(length(res_prev), nrow(res_prev[[1]])))
## [1] 200 10
stopCluster(cl)
Chapter 3 - foreach, future.apply, and Load Balancing
foreach:
foreach and parallel backends:
future and future.apply - packages that are continually under development:
Load balancing and scheduling:
future_lapply(x, fun, future.scheduling = 1) # 1 chunk per worker; future_lapply(x, fun, future.scheduling = FALSE) # 1 chunk per task. Example code includes:
# Recall the first chapter where you found the most frequent words from the janeaustenr package that are of certain minimum length
result <- lapply(letters, max_frequency, words = words, min_length = 5) %>%
unlist
# In this exercise, you will implement the foreach construct to solve the same problem
# The janeaustenr package, a vector of all words from the included books, words, and a function max_frequency() for finding the results based on a given starting letter are all available in your workspace
# Load the package
library(foreach)
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
# foreach construct
result <- foreach(l = letters, .combine=c) %do% max_frequency(l, words=words, min_length=5)
# Plot results
barplot(result, las = 2)
# Specifically, your job is to modify the code so that the maximum frequency for the first half of the alphabet is obtained for words that are two and more characters long, while the frequency corresponding to the second half of the alphabet is derived from words that are six and more characters long
# Note that we are using an alphabet of 26 characters
# foreach construct and combine into vector
result <- foreach(l = letters, n = rep(c(2, 6), each=13), .combine = c) %do%
max_frequency(l, words=words, min_length=n)
# Plot results
barplot(result, las = 2)
# Register doParallel with 2 cores
doParallel::registerDoParallel(cores=2)
# Parallel foreach loop
res <- foreach(r = rep(1000, 100), .combine = rbind,
.packages = "extraDistr") %dopar% myrdnorm(r)
# Dimensions of res
dim_res <- dim(res)
# So far you learned how to search for the most frequent word in a text sequentially using foreach()
# In the course of the next two exercises, you will implement the same task using doParallel and doFuture for parallel processing and benchmark it against the sequential version
# The sequential solution is implemented in function freq_seq() (type freq_seq in your console to see it)
# It iterates over a global character vector chars and calls the function max_frequency() which searches within a vector of words, while filtering for minimum word length
# All these objects are preloaded, as is the doParallel package
# Your job now is to write a function freq_doPar() that runs the same code in parallel via doParallel
# Sequential baseline: for each starting letter a-z, find the most frequent
# word of at least `min_length` characters. Relies on the workspace globals
# `words` (character vector of all words) and `max_frequency()`, and on
# foreach's sequential %do% operator; results are combined into one vector.
freq_seq <- function(min_length = 5)
foreach(l = letters, .combine = c) %do%
max_frequency(l, words = words, min_length = min_length)
# Function for doParallel foreach
# Parallel version of freq_seq() using the doParallel backend.
# cores:      number of workers to register with doParallel
# min_length: minimum word length passed through to max_frequency()
# Returns the combined vector of per-letter maximum frequencies.
freq_doPar <- function(cores, min_length = 5) {
# Register a doParallel backend of size `cores` for %dopar%
doParallel::registerDoParallel(cores=cores)
# foreach loop; .export/.packages ship the needed objects and packages to
# the workers (required on PSOCK clusters, which share nothing with the
# master process)
foreach(l=letters, .combine=c,
.export = c("max_frequency", "select_words", "words"),
.packages = c("janeaustenr", "stringr")) %dopar%
max_frequency(l, words=words, min_length=min_length)
}
# Run on 2 cores
freq_doPar(cores=2)
## again being could darcy every first great
## 1001 1445 3613 373 1456 972 981
## herself indeed jennings knightley little might never
## 1360 664 199 356 1295 1369 1362
## other place quite really should there under
## 1084 503 870 504 1541 2209 293
## visit would xviii young zealous
## 294 3238 4 766 5
# Now your job is to create a function freq_doFut() that accomplishes the same task as freq_doPar() but with the doFuture backend
# Note that when using doFuture, arguments .packages and .export in foreach() are not necessary, as the package deals with the exports automatically
# You will then benchmark these two functions, together with the sequential freq_seq()
# All the functions from the last exercise are available in your workspace
# In addition, the packages doFuture and microbenchmark are also preloaded
# To keep the computation time low, the global chars vector is set to the first six letters of the alphabet only
cores <- 2
min_length <- 5
# Error in tweak.function(strategy, ..., penvir = penvir) :
# Trying to use non-future function 'survival::cluster': function (x) { ... }
# For solution see https://github.com/HenrikBengtsson/future/issues/152
# Function for doFuture foreach
# Parallel version of freq_seq() using the doFuture backend. Unlike
# freq_doPar(), no .export/.packages arguments are needed: the future
# framework identifies and exports globals automatically.
# cores:      number of workers for the cluster plan
# min_length: minimum word length passed through to max_frequency()
freq_doFut <- function(cores, min_length = 5) {
# Register doFuture as the foreach backend and choose a cluster plan
doFuture::registerDoFuture()
future::plan(future::cluster, workers=cores)
# foreach loop, executed on the futures created by the plan above
foreach(l = letters, .combine = c) %dopar%
max_frequency(l, words = words, min_length = min_length)
}
# Benchmark
microbenchmark(freq_seq(min_length),
freq_doPar(cores, min_length),
freq_doFut(cores, min_length),
times = 1)
## Unit: seconds
## expr min lq mean median
## freq_seq(min_length) 9.812705 9.812705 9.812705 9.812705
## freq_doPar(cores, min_length) 12.176340 12.176340 12.176340 12.176340
## freq_doFut(cores, min_length) 11.597570 11.597570 11.597570 11.597570
## uq max neval
## 9.812705 9.812705 1
## 12.176340 12.176340 1
## 11.597570 11.597570 1
# It is straightforward to swap parallel backends with foreach
# In this small example, you might not see any time advantage in running it in parallel
# In addition, doFuture is usually somewhat slower than doParallel
# This is because doFuture has a higher computation overhead
# We encourage you to test these frameworks on more time-consuming applications where the overhead becomes negligible relative to the overall processing time
extract_words_from_text <- function(text) {
  # Tokenize `text` into words using stringr's word-boundary rules,
  # flatten the per-element lists, and lower-case every token.
  # Returns a character vector of lower-case words.
  tokens <- str_extract_all(text, boundary("word"))
  tolower(unlist(tokens))
}
# Main function
freq_fapply <- function(words, chars = letters, min_length = 5) {
  # Apply max_frequency() over each starting character in parallel via
  # future.apply (honours whatever future::plan() is currently set),
  # then flatten the result list into a single vector.
  per_char <- future.apply::future_lapply(
    chars,
    FUN = max_frequency,
    words = words,
    min_length = min_length
  )
  unlist(per_char)
}
obama <- readLines("./RInputFiles/obama.txt")
obama_speech <- paste(obama[obama != ""], collapse=" ")
# Extract words and call freq_fapply
words <- extract_words_from_text(obama_speech)
res <- freq_fapply(words)
# Plot results
barplot(res, las = 2)
# Now imagine you are a user of the fictional package from the previous exercise
# At home you have a two-CPU Mac computer, and at work you use a Linux cluster with two 16-CPU computers, called "oisin" and "oscar"
# Your job is to write a function for each of the hardware that calls freq_fapply() while taking advantage of all available CPUs
# For the cluster, you set workers to a vector of computer names corresponding to the number of CPUs, i.e. 16 x "oisin" and 16 x "oscar"
# For a one-CPU environment, we have created a function fapply_seq()
# fapply_seq <- function(...) {
# future::plan(strategy="sequential")
# freq_fapply(words, letters, ...)
# }
# multicore function
# fapply_mc <- function(cores=2, ...) {
# plan(strategy="multicore", workers=cores)
# freq_fapply(words, letters, ...)
# }
# cluster function
# fapply_cl <- function(cores=NULL, ...) {
# # set default value for cores
# if(is.null(cores))
# cores <- rep(c("oisin", "oscar"), each = 16)
#
# # parallel processing
# plan(strategy="cluster", workers=cores)
# freq_fapply(words, letters, ...)
# }
# Note: Multicore does not work on Windows. We recommend using the 'multiprocess' or 'cluster' plan on Windows computers.
# Microbenchmark
# microbenchmark(fapply_seq = fapply_seq(),
# fapply_mc_2 = fapply_mc(cores=2),
# fapply_mc_10 = fapply_mc(cores=10),
# fapply_cl = fapply_cl(cores=2),
# times = 1)
# Which is the slowest?
# slowest1 <- "fapply_cl"
# This is because for a small number of tasks a sequential code can run faster than a parallel version due to the parallel overhead
# The cluster plan has usually the largest overhead and thus, should be used only for larger number of tasks
# The multicore may be more efficient when the number of workers is equal to the number of cores
# It uses shared memory, and thus is faster than cluster
# In your workspace there is a vector tasktime containing simulated processing times of 30 tasks (generated using runif())
# There is also a cluster object cl with two nodes
# Your job is to apply the function Sys.sleep() to tasktime in parallel using clusterApply() and clusterApplyLB() and benchmark them
# The parallel and microbenchmark packages are loaded
# We also provided functions for plotting cluster usage plots called plot_cluster_apply() and plot_cluster_applyLB()
# Both functions use functionality from the snow package
tasktime <- c(0.1328, 0.1861, 0.2865, 0.4541, 0.1009, 0.4492, 0.4723, 0.3304, 0.3146, 0.031, 0.1031, 0.0884, 0.3435, 0.1921, 0.3849, 0.2489, 0.3588, 0.496, 0.1901, 0.3887, 0.4674, 0.1062, 0.3259, 0.0629, 0.1337, 0.1931, 0.0068, 0.1913, 0.4349, 0.1702)
# plot_cluster_apply <- function(cl, x, fun)
# plot(snow::snow.time(snow::clusterApply(cl, x, fun)),
# title = "Cluster usage of clusterApply")
# plot_cluster_applyLB <- function(cl, x, fun)
# plot(snow::snow.time(snow::clusterApplyLB(cl, x, fun)),
# title = "Cluster usage of clusterApplyLB")
# Benchmark clusterApply and clusterApplyLB
# microbenchmark(
# clusterApply(cl, tasktime, Sys.sleep),
# clusterApplyLB(cl, tasktime, Sys.sleep),
# times = 1
# )
# Plot cluster usage
# plot_cluster_apply(cl, tasktime, Sys.sleep)
# plot_cluster_applyLB(cl, tasktime, Sys.sleep)
# Now we compare the results from the previous exercise with ones generated using parSapply(), which represents here an implementation that groups tasks into as many chunks as there are workers available
# You first explore its cluster usage plot, using the function plot_parSapply() we defined for you
# We generated a version of the tasktime vector, called bias_tasktime that generates very uneven load
# Your job is to compare the run times of parSapply() with clusterApplyLB() applied to bias_tasktime
# plot_parSapply <- function(cl, x, fun)
# plot(snow::snow.time(snow::parSapply(cl, x, fun)),
# title = "Cluster usage of parSapply")
# bias_tasktime <- c(1, 1, 1, 0.1, 0.1, 0.1, 1e-04, 1e-04, 1e-04, 0.001, 1)
# Plot cluster usage for parSapply
# plot_parSapply(cl, tasktime, Sys.sleep)
# Microbenchmark
# microbenchmark(
# clusterApplyLB(cl, bias_tasktime, Sys.sleep),
# parSapply(cl, bias_tasktime, Sys.sleep),
# times = 1
# )
# Plot cluster usage for parSapply and clusterApplyLB
# plot_cluster_applyLB(cl, bias_tasktime, Sys.sleep)
# plot_parSapply(cl, bias_tasktime, Sys.sleep)
Chapter 4 - Random Numbers and Reproducibility
Are results reproducible?
Parallel random number generators:
Reproducibility in foreach and future.apply:
Next steps:
Example code includes:
# In addition to the code in the previous exercise, we also created a FORK cluster for you.
# cl.fork <- makeCluster(2, type = "FORK")
# Your job is to register the two cluster objects with the preloaded doParallel package and compare results obtained with parallel foreach
# How do the results differ in terms of reproducibility?
library(doParallel)
## Loading required package: iterators
cl.sock <- makeCluster(2, type = "PSOCK")
registerDoParallel(cl.sock)
set.seed(100)
foreach (i = 1:2) %dopar% rnorm(3)
## [[1]]
## [1] -1.2969535 -0.7594659 -0.1774645
##
## [[2]]
## [1] 0.3385876 0.3487904 1.7484280
# Register and use cl.sock
registerDoParallel(cl.sock)
replicate(2, {
set.seed(100)
foreach(i = 1:2, .combine = rbind) %dopar% rnorm(3)
}, simplify = FALSE
)
## [[1]]
## [,1] [,2] [,3]
## result.1 0.6411187 0.9961642 1.1951045
## result.2 1.2780694 0.9840492 -0.3302664
##
## [[2]]
## [,1] [,2] [,3]
## result.1 -0.2018902 0.4543543 0.848828
## result.2 -2.9367624 -1.3175043 0.644850
# Register and use cl.fork
# registerDoParallel(cl.fork)
# replicate(2, {
# set.seed(100)
# foreach(i = 1:2, .combine = rbind) %dopar% rnorm(3)
# }, simplify = FALSE
# )
# Create a cluster
cl <- makeCluster(2)
# Check RNGkind on workers
clusterCall(cl, RNGkind)
## [[1]]
## [1] "Mersenne-Twister" "Inversion"
##
## [[2]]
## [1] "Mersenne-Twister" "Inversion"
# Set the RNG seed on workers
clusterSetRNGStream(cl, iseed=100)
# Check RNGkind on workers
clusterCall(cl, RNGkind)
## [[1]]
## [1] "L'Ecuyer-CMRG" "Inversion"
##
## [[2]]
## [1] "L'Ecuyer-CMRG" "Inversion"
# Now you are ready to make your results reproducible
# You will use the simple embarrassingly parallel application for computing a mean of random numbers (myfunc) which we parallelized in the second chapter using clusterApply()
# The parallel package, myfunc() , n (sample size, set to 1000) and repl (number of replicates, set to 5) are available in your workspace
# You will now call clusterApply() repeatedly to check if results can be reproduced, without and with initializing the RNG
n <- 1000
repl <- 5
# Create a cluster of size 2
cl <- makeCluster(2)
# Call clusterApply three times
for(i in 1:3)
print(unlist(clusterApply(cl, rep(n, repl), myfunc)))
## [1] -0.01501670 0.04839363 -0.04065913 0.03334212 -0.02679950
## [1] -0.0538564821 -0.0262389749 0.0006322522 -0.0441563568 0.0019852047
## [1] -0.031270802 0.002946511 -0.021341988 -0.039455055 0.004148929
# Create a seed object
seed <- 1234
# Repeatedly set the cluster seed and call clusterApply()
for(i in 1:3) {
clusterSetRNGStream(cl, iseed = seed)
print(unlist(clusterApply(cl, rep(n, repl), myfunc)))
}
## [1] -0.008597904 -0.006089337 -0.013980519 -0.066293388 0.004297755
## [1] -0.008597904 -0.006089337 -0.013980519 -0.066293388 0.004297755
## [1] -0.008597904 -0.006089337 -0.013980519 -0.066293388 0.004297755
# Create two cluster objects, of size 2 and 4
cl2 <- makeCluster(2)
cl4 <- makeCluster(4)
# Set seed on cl2 and call clusterApply
clusterSetRNGStream(cl2, iseed = seed)
unlist(clusterApply(cl2, rep(n, repl), myfunc))
## [1] -0.008597904 -0.006089337 -0.013980519 -0.066293388 0.004297755
# Set seed on cl4 and call clusterApply
clusterSetRNGStream(cl4, iseed = seed)
unlist(clusterApply(cl4, rep(n, repl), myfunc))
## [1] -0.008597904 -0.006089337 0.077876985 -0.072012937 -0.013980519
# Register doParallel and doRNG
library(doRNG)
## Loading required package: rngtools
## Loading required package: pkgmaker
## Loading required package: registry
##
## Attaching package: 'pkgmaker'
## The following object is masked from 'package:base':
##
## isFALSE
registerDoParallel(cores = 2)
doRNG::registerDoRNG(seed)
# Call ar1_block via foreach
mpar <- foreach(r=1:5) %dopar% ar1_block(r)
# Register sequential backend, set seed and run foreach
registerDoSEQ()
set.seed(seed)
mseq <- foreach(r=1:5) %dorng% ar1_block(r)
# Check if results identical
identical(mpar, mseq)
## [1] TRUE
# You are able to reproduce sequential and parallel applications! Remember to always use %dorng% if you use the doSEQ backend
# Also note that by default on the Linux DataCamp server, registerDoParallel() creates a FORK cluster if a number of cores is passed to it
# As a result, there was no need to export any functions to workers, as they were copied from the master
# On a different platform, the .export option may be needed
# Set multiprocess plan
future::plan(strategy="multiprocess", workers = 2)
# Call ar1_block via future_lapply
mfpar <- future.apply::future_lapply(1:5, FUN=ar1_block, future.seed=seed)
# Set sequential plan and repeat future_lapply
future::plan(strategy="sequential")
mfseq <- future.apply::future_lapply(1:5, FUN=ar1_block, future.seed=seed)
# Check if results are identical
identical(mfpar, mfseq)
## [1] TRUE
rm(mean)
rm(sd)
Chapter 1 - Quickstart Guide
Why choice?
Inspecting choice data:
Fitting and interpreting a choice model:
Using choice models to make decisions:
Example code includes:
# Unload conflicting namespaces
unloadNamespace("rms")
unloadNamespace("quantreg")
unloadNamespace("MatrixModels")
unloadNamespace("lmerTest")
unloadNamespace("semPlot")
unloadNamespace("rockchalk")
unloadNamespace("qgraph")
unloadNamespace("sem")
unloadNamespace("mi")
unloadNamespace("arm")
unloadNamespace("mice")
unloadNamespace("mitml")
unloadNamespace("jomo")
unloadNamespace("arm")
unloadNamespace("jomo")
unloadNamespace("lme4")
# load the mlogit library
library(mlogit)
scLong <- read.csv("./RInputFiles/sportscar_choice_long.csv")
scWide <- read.csv("./RInputFiles/sportscar_choice_wide.csv")
sportscar <- scLong
sportscar$alt <- as.factor(sportscar$alt)
sportscar$seat <- as.factor(sportscar$seat)
sportscar$price <- as.factor(sportscar$price)
sportscar$choice <- as.logical(sportscar$choice)
sportscar <- sportscar %>% rename(resp.id=resp_id)
sportscar$key <- rep(1:2000, each=3)
row.names(sportscar) <- paste(sportscar$key, sportscar$alt, sep=".")
sportscar <- mlogit.data(sportscar, shape="long", choice="choice", alt.var="alt")
str(sportscar)
# Create a table of chosen sportscars by transmission type
chosen_by_trans <- xtabs(choice ~ trans, data = sportscar)
# Print the chosen_by_trans table to the console
chosen_by_trans
# Plot the chosen_by_price object
barplot(chosen_by_trans)
# Crashes out due to issue with class "family" in MatrixModels and lme4
m1 <- mlogit(choice ~ seat + trans + convert + price, data=sportscar, seed=10)
# fit a choice model using mlogit() and assign the output to m1
# m1 <- mlogit::mlogit(choice ~ seat + trans + convert + price,
# data=sportscar,
# chid.var="key",
# alt.var="alt",
# choice="choice",
# seed=10
# )
# summarize the m1 object to see the output of the choice model
summary(m1)
predict_mnl <- function(model, products) {
  # Predict multinomial-logit market shares for a set of product profiles.
  #
  # model:    fitted mlogit object (uses model$formula and its coefficients)
  # products: data frame of product designs, coded with the same factor
  #           levels as the data used to estimate `model`
  # returns:  data frame with a `share` column cbind-ed onto `products`
  #
  # Build the design matrix from the model formula: replace the response
  # with 0, then drop the intercept column produced by model.matrix().
  data.model <- model.matrix(update(model$formula, 0 ~ .), data = products)[, -1]
  # Use coef() instead of model$coef: $coef only works via partial
  # matching against the `coefficients` element, which is fragile.
  utility <- data.model %*% coef(model)
  # Softmax of utilities gives the predicted choice shares.
  share <- exp(utility) / sum(exp(utility))
  cbind(share, products)
}
# inspect products
products <- data.frame(seat=factor("2", levels=c("2", "4", "5")),
trans=factor(rep(c("manual", "auto"), each=2), levels=c("auto", "manual")),
convert=factor(rep(c("no", "yes"), times=2), levels=c("no", "yes")),
price=factor("35", levels=c("30", "35", "40"))
)
str(products)
# use predict_mnl to predict share for products
shares <- predict_mnl(m1, products)
# print the shares to the console
shares
barplot(shares$share, ylab="Predicted Market Share",
names.arg=c("Our Car", "Comp 1", "Comp 2", "Comp 3"))
Chapter 2 - Managing and Summarizing Choice Data
Assembling choice data:
Converting from wide to long:
Choice data in two files:
Visualizing choice data:
Designing a conjoint survey:
Example code includes:
chLong <- read.csv("./RInputFiles/chocolate_choice_long.csv")
chWide <- read.csv("./RInputFiles/chocolate_choice_wide.csv")
chocolate_wide <- chWide
# Look at the head() of chocolate_wide
head(chocolate_wide)
## Subject Trial Brand1 Brand2 Brand3 Price1 Price2 Price3
## 1 2401 1 Dove Godiva Dove 0.6 0.7 3.6
## 2 2401 2 Godiva Godiva Hershey's 2.7 3.9 0.7
## 3 2401 3 Hershey's Godiva Hershey's 1.7 3.7 3.0
## 4 2401 4 Lindt Lindt Ghirardelli 1.0 3.6 0.5
## 5 2401 5 Hershey's Godiva Hershey's 0.8 1.5 3.3
## 6 2401 6 Lindt Dove Godiva 3.1 2.5 2.6
## Type1 Type2 Type3 Selection Response_Time
## 1 Milk Dark White 1 5210
## 2 Milk w/ Nuts Dark Milk w/ Nuts 2 7480
## 3 Dark w/ Nuts Dark Dark 2 7704
## 4 Milk Milk w/ Nuts Dark w/ Nuts 1 5774
## 5 Milk w/ Nuts Dark White 2 5238
## 6 Milk White Dark 3 3423
# Use summary() to see which brands and types are in chocolate_wide
summary(chocolate_wide)
## Subject Trial Brand1 Brand2
## Min. :2401 Min. : 1 Dove :60 Dove :85
## 1st Qu.:2405 1st Qu.: 7 Ghirardelli:58 Ghirardelli:67
## Median :2410 Median :13 Godiva :83 Godiva :74
## Mean :2409 Mean :13 Hershey's :63 Hershey's :66
## 3rd Qu.:2413 3rd Qu.:19 Lindt :86 Lindt :58
## Max. :2417 Max. :25
## Brand3 Price1 Price2 Price3
## Dove :69 Min. :0.500 Min. :0.500 Min. :0.500
## Ghirardelli:61 1st Qu.:1.100 1st Qu.:1.300 1st Qu.:1.300
## Godiva :78 Median :2.200 Median :2.400 Median :2.200
## Hershey's :78 Mean :2.144 Mean :2.255 Mean :2.233
## Lindt :64 3rd Qu.:3.100 3rd Qu.:3.200 3rd Qu.:3.100
## Max. :4.000 Max. :4.000 Max. :4.000
## Type1 Type2 Type3 Selection
## Dark :63 Dark :95 Dark :75 Min. :1.000
## Dark w/ Nuts:70 Dark w/ Nuts:68 Dark w/ Nuts:75 1st Qu.:1.000
## Milk :75 Milk :55 Milk :60 Median :2.000
## Milk w/ Nuts:83 Milk w/ Nuts:55 Milk w/ Nuts:67 Mean :1.926
## White :59 White :77 White :73 3rd Qu.:3.000
## Max. :3.000
## Response_Time
## Min. : 1021
## 1st Qu.: 2750
## Median : 3878
## Mean : 4713
## 3rd Qu.: 5766
## Max. :24462
# use reshape() to change the data from long to wide
chocolate <- reshape(data= chocolate_wide , direction="long",
varying = list(Brand=3:5, Price=6:8, Type=9:11),
v.names=c("Brand", "Price", "Type"), timevar="Alt")
# use head() to confirm that the data has been properly transformed
head(chocolate)
## Subject Trial Selection Response_Time Alt Brand Price Type
## 1.1 2401 1 1 5210 1 Dove 0.6 Milk
## 2.1 2401 2 2 7480 1 Godiva 2.7 Milk w/ Nuts
## 3.1 2401 3 2 7704 1 Hershey's 1.7 Dark w/ Nuts
## 4.1 2401 4 1 5774 1 Lindt 1.0 Milk
## 5.1 2401 5 2 5238 1 Hershey's 0.8 Milk w/ Nuts
## 6.1 2401 6 3 3423 1 Lindt 3.1 Milk
## id
## 1.1 1
## 2.1 2
## 3.1 3
## 4.1 4
## 5.1 5
## 6.1 6
# Create the new order for the chocolate data frame
new_order <- order(chocolate$Subject, chocolate$Trial, chocolate$Alt)
# Reorder the chocolate data frame to the new_order
chocolate <- chocolate[new_order,]
# Look at the head() of chocolate to see how it has been reordered
head(chocolate)
## Subject Trial Selection Response_Time Alt Brand Price Type
## 1.1 2401 1 1 5210 1 Dove 0.6 Milk
## 1.2 2401 1 1 5210 2 Godiva 0.7 Dark
## 1.3 2401 1 1 5210 3 Dove 3.6 White
## 2.1 2401 2 2 7480 1 Godiva 2.7 Milk w/ Nuts
## 2.2 2401 2 2 7480 2 Godiva 3.9 Dark
## 2.3 2401 2 2 7480 3 Hershey's 0.7 Milk w/ Nuts
## id
## 1.1 1
## 1.2 1
## 1.3 1
## 2.1 2
## 2.2 2
## 2.3 2
# Use head(chocolate) and look at the Selection variable.
head(chocolate)
## Subject Trial Selection Response_Time Alt Brand Price Type
## 1.1 2401 1 1 5210 1 Dove 0.6 Milk
## 1.2 2401 1 1 5210 2 Godiva 0.7 Dark
## 1.3 2401 1 1 5210 3 Dove 3.6 White
## 2.1 2401 2 2 7480 1 Godiva 2.7 Milk w/ Nuts
## 2.2 2401 2 2 7480 2 Godiva 3.9 Dark
## 2.3 2401 2 2 7480 3 Hershey's 0.7 Milk w/ Nuts
## id
## 1.1 1
## 1.2 1
## 1.3 1
## 2.1 2
## 2.2 2
## 2.3 2
# Transform the Selection variable to a logical indicator
chocolate$Selection <- chocolate$Alt == chocolate$Selection
# Use head(chocolate) to see how the Selection variable has changed. Now it is logical.
head(chocolate)
## Subject Trial Selection Response_Time Alt Brand Price Type
## 1.1 2401 1 TRUE 5210 1 Dove 0.6 Milk
## 1.2 2401 1 FALSE 5210 2 Godiva 0.7 Dark
## 1.3 2401 1 FALSE 5210 3 Dove 3.6 White
## 2.1 2401 2 FALSE 7480 1 Godiva 2.7 Milk w/ Nuts
## 2.2 2401 2 TRUE 7480 2 Godiva 3.9 Dark
## 2.3 2401 2 FALSE 7480 3 Hershey's 0.7 Milk w/ Nuts
## id
## 1.1 1
## 1.2 1
## 1.3 1
## 2.1 2
## 2.2 2
## 2.3 2
choc_choice <- chocolate %>%
filter(Selection==TRUE) %>%
mutate(Selection=Alt) %>%
select(Subject, Trial, Response_Time, Selection)
choc_alts <- chocolate %>%
select(Subject, Trial, Alt, Brand, Price, Type)
str(choc_choice)
## 'data.frame': 350 obs. of 4 variables:
## $ Subject : int 2401 2401 2401 2401 2401 2401 2401 2401 2401 2401 ...
## $ Trial : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Response_Time: int 5210 7480 7704 5774 5238 3423 4691 3268 6719 3542 ...
## $ Selection : int 1 2 2 1 2 3 3 3 2 2 ...
str(choc_alts)
## 'data.frame': 1050 obs. of 6 variables:
## $ Subject: int 2401 2401 2401 2401 2401 2401 2401 2401 2401 2401 ...
## $ Trial : int 1 1 1 2 2 2 3 3 3 4 ...
## $ Alt : int 1 2 3 1 2 3 1 2 3 1 ...
## $ Brand : Factor w/ 5 levels "Dove","Ghirardelli",..: 1 3 1 3 3 4 4 3 4 5 ...
## $ Price : num 0.6 0.7 3.6 2.7 3.9 0.7 1.7 3.7 3 1 ...
## $ Type : Factor w/ 5 levels "Dark","Dark w/ Nuts",..: 3 1 5 4 1 4 2 1 1 3 ...
## - attr(*, "reshapeLong")=List of 4
## ..$ varying:List of 3
## .. ..$ Brand: chr "Brand1" "Brand2" "Brand3"
## .. ..$ Price: chr "Price1" "Price2" "Price3"
## .. ..$ Type : chr "Type1" "Type2" "Type3"
## ..$ v.names: chr "Brand" "Price" "Type"
## ..$ idvar : chr "id"
## ..$ timevar: chr "Alt"
# Merge choc_choice and choc_alts
choc_merge <- merge(choc_choice, choc_alts, by=c("Subject", "Trial"))
# Convert Selection to a logical variable
choc_merge$Selection <- choc_merge$Selection == choc_merge$Alt
# Inspect chocolate_merge using head
head(choc_merge)
## Subject Trial Response_Time Selection Alt Brand Price Type
## 1 2401 1 5210 TRUE 1 Dove 0.6 Milk
## 2 2401 1 5210 FALSE 2 Godiva 0.7 Dark
## 3 2401 1 5210 FALSE 3 Dove 3.6 White
## 4 2401 10 3542 FALSE 1 Lindt 0.6 Milk w/ Nuts
## 5 2401 10 3542 TRUE 2 Godiva 0.8 Milk w/ Nuts
## 6 2401 10 3542 FALSE 3 Hershey's 3.7 Dark
# Use xtabs to count up how often each Type is chosen
counts <- xtabs(~ Type + Selection, data=chocolate)
# Plot the counts
plot(counts, cex = 1.5)
# Modify this code to count up how many times each **Brand** is chosen
counts <- xtabs(~ Brand + Selection, data=chocolate)
# Plot the counts
plot(counts, cex = 1.5)
# Use xtabs to count up how often each Price is chosen
counts <- xtabs(~ Price + Selection, data=chocolate)
# Plot the counts
plot(counts, cex=0.6)
Chapter 3 - Building Choice Models
Choice models - under the hood:
Interpreting choice model parameters:
Intercepts and interactions:
Predicting shares:
Example code includes:
# use mlogit.data() to convert chocolate to mlogit.data
chocolate_df <- mlogit.data(chocolate, shape = "long",
choice = "Selection", alt.var = "Alt",
varying = 6:8)
# use str() to confirm that chocolate is an mlogit.data object
str(chocolate_df)
# Fit a model with mlogit() and assign it to choc_m1
choc_m1 <- mlogit(Selection ~ Brand + Type + Price, data=chocolate_df, print.level=3)
# Summarize choc_m1 with summary()
summary(choc_m1)
# modify the call to mlogit to exclude the intercept
choc_m2 <- mlogit(Selection ~ 0 + Brand + Type + Price, data = chocolate_df, print.level=3)
# summarize the choc_m2 model
summary(choc_m2)
# compute the wtp by dividing the coefficient vector by the negative of the price coefficient
coef(choc_m2) / -coef(choc_m2)["Price"]
# change the Price variable to a factor in the chocolate data
chocolate$Price <- as.factor(chocolate$Price)
# fit a model with mlogit and assign it to choc_m3
choc_m3 <- mlogit(Selection ~ 0 + Brand + Type + Price, data=chocolate)
# inspect the coefficients
summary(choc_m3)
# likelihood ratio test comparing two models
lrtest(choc_m2, choc_m3)
# add the formula for mlogit
choc_m4 <- mlogit(Selection ~ 0 + Brand + Type + Price + Brand:Type, data=chocolate)
# use summary to see the coefficients
summary(choc_m4)
# add the formula for mlogit
choc_m5 <- mlogit(Selection ~ 0 + Brand + Type + Price + Price:Trial, data=chocolate)
# use summary to see the outputs
summary(choc_m5)
# add the formula for mlogit
choc_m5 <- mlogit(Selection ~ 0 + Brand + Type + Price + Price:Trial, data=chocolate)
# use summary to see the outputs
summary(choc_m5)
predict_mnl <- function(model, products) {
  # Predict MNL market shares for the given product profiles.
  # model:    mlogit fit (model$formula, model$coef)
  # products: data frame of designs coded like the estimation data
  # Returns `products` with a predicted `share` column prepended.
  design <- model.matrix(update(model$formula, 0 ~ .), data = products)
  design <- design[, -1]  # drop the intercept column
  u <- design %*% model$coef
  eu <- exp(u)
  share <- eu / sum(eu)  # softmax over the alternatives
  cbind(share, products)
}
# modify the code below so that the segment is set to "racer" for both alternatives
price <- c(35, 30)
seat <- factor(c(2, 2), levels=c(2,4,5))
trans <- factor(c("manual", "auto"), levels=c("auto", "manual"))
convert <- factor(c("no", "no"), levels=c("no", "yes"))
segment <- factor(c("racer", "racer"), levels=c("basic", "fun", "racer"))
prod <- data.frame(seat, trans, convert, price, segment)
# predict shares for the "racer" segment
predict_mnl(model=m5, products=prod)
# fit the choc_m2 model
choc_m2 <- mlogit(Selection ~ 0 + Brand + Type + Price, data=chocolate)
# create a data frame with the Ghirardelli products
Brand <- factor(rep("Ghirardelli", 5), level = levels(chocolate$Brand))
Type <- levels(chocolate$Type)
Price <- 3 # treated as a number in choc_m2
ghir_choc <- data.frame(Brand, Type, Price)
# predict shares
predict_mnl(model=choc_m2, products=ghir_choc)
# compute and save the share prediction
shares <- predict_mnl(choc_m2, ghir_choc)
# make a barplot of the shares
barplot(shares$share,
horiz = TRUE, col="tomato2",
xlab = "Predicted Market Share",
main = "Shares for Ghiradelli chocolate bars at $3 each",
names.arg = levels(chocolate$Type)
)
Chapter 4 - Hierarchical Choice Models
What is a hierarchical choice model?
Pseudo-code for individual-level share prediction: X <- X[X$resp == i & X$task == j, ]; u <- X %*% beta[i]; p[i, ] <- exp(u) / sum(exp(u))
Heterogeneity in preferences for other features:
Predicting shares with hierarchical models:
Wrap up:
Example code includes:
# Determine the number of subjects in chocolate$Subjects
length(levels(chocolate$Subject))
# add id.var input to mlogit.data call
chocolate <- mlogit.data(chocolate, choice = "Selection", shape="long",
varying=6:8, alt.var = "Alt", id.var = "Subject"
)
# add rpar and panel inputs to mlogit call
choc_m6 <- mlogit(Selection ~ 0 + Brand + Type + Price, data = chocolate,
rpar = c(Price="n"), panel=TRUE)
# plot the model
plot(choc_m6)
# set the contrasts for Brand to effects code
contrasts(chocolate$Brand) <- contr.sum(levels(chocolate$Brand))
dimnames(contrasts(chocolate$Brand))[[2]] <- levels(chocolate$Brand)[1:4]
contrasts(chocolate$Brand)
# set the contrasts for Type to effects code
contrasts(chocolate$Type) <- contr.sum(levels(chocolate$Type))
dimnames(contrasts(chocolate$Type))[[2]] <- levels(chocolate$Type)[1:4]
contrasts(chocolate$Type)
# create my_rpar vector
choc_m2 <- mlogit(Selection ~ 0 + Brand + Type + Price, data=chocolate)
my_rpar <- rep("n", length(choc_m2$coef))
names(my_rpar) <- names(choc_m2$coef)
my_rpar
# fit model with random coefficients
choc_m7 <- mlogit(Selection ~ 0 + Brand + Type + Price, data=chocolate, rpar=my_rpar, panel=TRUE)
# print the coefficients
choc_m7$coef[5:8]
# compute the negative sum of those coefficients
-sum(choc_m7$coef[5:8])
# Extract the mean parameters and assign to mean
mean <- choc_m8$coef[1:9]
# Extract the covariance parameters and assign to Sigma
Sigma <- cov.mlogit(choc_m8)
# Create storage for individual draws of share
share <- matrix(NA, nrow=1000, ncol=nrow(choc_line_coded))
# For each draw (person)
for (i in 1:1000) {
# Draw a coefficient vector
coef <- mvrnorm(1, mu=mean, Sigma=Sigma)
# Compute utilities for those coef
utility <- choc_line_coded %*% coef
# Compute probabilities according to the logit formula
share[i,] <- exp(utility) / sum(exp(utility))
}
# Average the draws of the shares
cbind(colMeans(share), choc_line)
Chapter 1 - What Is RNA Single-Cell RNA-Seq?
Background and utility:
Typical workflow:
Load, create, and access data:
Example code includes:
# head of count matrix
counts[1:3, 1:3]
# count of specific gene and cell
alignedReads <- counts['Cnr1', "SRR2140055"]
# overall percentage of zero counts
pZero <- mean(counts == 0)
# cell library size
libsize <- colSums(counts)
# find cell coverage
coverage <- colMeans(counts > 0)
cell_info$coverage <- coverage
# load ggplot2
library(ggplot2)
# plot cell coverage
ggplot(cell_info, aes(x = names, y = coverage)) +
geom_point() +
ggtitle('Cell Coverage') +
xlab('Cell Name') +
ylab('Coverage')
# mean of GC content
gc_mean <- mean(gene_info$gc)
# standard deviation of GC content
gc_sd <- sd(gene_info$gc)
# boxplot of GC content
boxplot(gene_info$gc, main = 'Boxplot - GC content', ylab = 'GC content')
# batch
batch <- cell_info$batch
# patient
patient <- cell_info$patient
# nesting of batch within patient
batch_patient <- table(batch = batch, patient = patient)
# explore batch_patient
batch_patient
# load SingleCellExperiment
library(SingleCellExperiment)
# create a SingleCellExperiment object
sce <- SingleCellExperiment(assays = list(counts = counts ),
rowData = data.frame(gene_names = rownames(counts)),
colData = data.frame(cell_names = colnames(counts)))
# create a SummarizedExperiment object from counts
se <- SummarizedExperiment(assays = list(counts = counts))
# create a SingleCellExpression object from se
sce <- as(se, "SingleCellExperiment")
# create SingleCellExperiment object
sce <- as(allen, "SingleCellExperiment")
# cell information
cell_info <- colData(sce)
# size factors
sizeFactors(sce) <- colSums(assay(sce))
Chapter 2 - Quality Control and Normalization
Quality Control:
Quality Control (continued):
Normalization:
Example code includes:
# --- Quality control and normalization of the SingleCellExperiment ---
# remove genes with only zeros (keep genes detected in at least one cell)
nonZero <- counts(sce) > 0
keep <- rowSums(nonZero) > 0
sce_2 <- sce[keep, ]
# flag ERCC spike-in controls by their gene-name prefix
isSpike(sce_2, "ERCC") <- grepl("^ERCC-", rownames(sce_2))
# load scater
library(scater)
# calculate per-cell/per-gene QC metrics, treating ERCC spike-ins as controls
sce <- calculateQCMetrics(sce, feature_controls = list(ERCC = isSpike(sce, "ERCC")))
# explore coldata of sce (now contains total_counts, total_features, ...)
colData(sce)
# set a minimum library-size threshold
threshold <- 20000
# plot the library-size density with the threshold marked
plot(density(sce@colData$total_counts), main = 'Density - total_counts')
abline(v = threshold)
# keep cells with library size above the threshold
keep <- sce@colData$total_counts > threshold
# tabulate kept cells
table(keep)
# set a minimum detected-features threshold
threshold <- 6000
# plot the detected-features density with the threshold marked
plot(density(sce$total_features), main = 'Density - total_features')
abline(v=threshold)
# keep cells with enough detected features
keep <- sce$total_features > threshold
# tabulate kept cells
table(keep)
# extract cell data into a data frame for plotting with ggplot2
cDataFrame <- as.data.frame(colData(sce))
# spike-in counts vs. library size, colored by batch (flags problem batches)
ggplot(cDataFrame, aes(x = total_counts, y = total_counts_ERCC, col = batch)) +
geom_point()
# keep cells outside the problematic batch NA19098.r2
keep <- sce$batch != "NA19098.r2"
# tabulate kept cells
table(keep)
# load SingleCellExperiment
library(SingleCellExperiment)
# flag genes expressed (count > 1) in more than one cell
filter_genes <- apply(counts(sce), 1, function(x){
length(x[x > 1]) > 1
})
# tabulate the results of filter_genes
# NOTE(review): filter_genes is never used to subset sce here — presumably
# the exercise only asked for the tabulation; confirm before reusing.
table(filter_genes)
# PCA on raw counts (dominated by library size)
plotPCA(sce, exprs_values = "counts",
colour_by = "batch", shape_by = "individual")
# PCA on log counts (reduces the library-size effect)
plotPCA(sce, exprs_values = "logcounts_raw",
colour_by = "batch", shape_by = "individual")
# extract the first 2 principal components
pca <- reducedDim(sce, "PCA")[, 1:2]
# build a small data frame for the diagnostic plot below
cdata <- data.frame(PC1 = pca[, 1],
libsize = sce$total_counts,
batch = sce$batch)
# PC1 vs. library size: a strong trend means PC1 captures depth, not biology
ggplot(cdata, aes(x = PC1, y = libsize, col = batch)) +
geom_point()
# load scran
library(scran)
# estimate pool-based size factors (deconvolution method)
sce <- computeSumFactors(sce)
# display size factor histogram
hist(sizeFactors(sce))
# view assays
assays(sce)
# apply the size factors to produce a 'logcounts' assay
normalized_sce <- normalize(sce)
# view new assay for normalized logcounts
assays(normalized_sce)
Chapter 3 - Visualization and Dimensionality Reduction
Mouse Epithelium Dataset:
Visualization:
Dimensionality Reduction:
Example code includes:
# --- Exploration and visualization of the mouse epithelium dataset ---
# find dimensions (genes x cells)
mydims <- dim(sce)
# extract cell and gene names
cellNames <- colnames(sce)
geneNames <- rownames(sce)
# cell-level metadata
cData <- colData(sce)
# print column names
colnames(cData)
# keep only batch and published cluster labels
cData <- cData[, c('Batch', 'publishedClusters')]
# cross-tabulate batches against clusters
table(cData)
# load scater
library(scater)
# PCA on raw counts, shaped by batch and colored by published cluster
plotPCA(
object = sce,
exprs_values = "counts",
shape_by = "Batch",
colour_by = "publishedClusters"
)
# explore initial assays
assays(sce)
# create log counts (log1p handles the zeros in the counts matrix)
logcounts <- log1p(assays(sce)$counts)
# add log counts as a new assay
assay(sce, 'logcounts') <- logcounts
assays(sce)
# PCA on the new logcounts assay
plotPCA(object = sce, exprs_values = "logcounts",
shape_by = "Batch", colour_by = "publishedClusters")
# t-SNE on raw counts; perplexity controls the effective neighborhood size
plotTSNE(
sce,
exprs_values = "counts",
shape_by = "publishedClusters",
colour_by = "Batch",
perplexity = 5
)
# Per-gene variance of the log1p-transformed counts, used to rank genes
# by variability before subsetting for PCA.
vars <- assay(sce) %>% log1p() %>% rowVars()
# label the variances with gene names so sorting keeps the gene mapping
names(vars) <- rownames(sce)
# sort so the most variable genes come first
vars_2 <- sort(vars, decreasing = TRUE)
head(vars_2)
# subset sce to the 50 MOST VARIABLE genes.
# BUG FIX: the original used names(vars[1:50]), which takes the first 50
# genes in the unsorted order and ignores the sorted vars_2 entirely;
# the intended selection is the top of the sorted vector.
sce_sub <- sce[names(vars_2)[1:50], ]
sce_sub
# --- Manual PCA on the 50-gene subset and ggplot2 visualization ---
# log-transform counts of the subset (log1p handles zeros)
logcounts <- log1p(assays(sce_sub)$counts)
# transpose so rows are cells (observations) and columns are genes (variables)
tlogcounts <- t(logcounts)
# perform pca with base R
pca <- prcomp(tlogcounts)
# store the PCA coordinates back in the SingleCellExperiment
reducedDims(sce_sub) <- SimpleList(PCA = pca$x)
head(reducedDim(sce_sub, "PCA")[, 1:2])
# Extract PC1 and PC2 and create a data frame with cluster/batch annotation
pca <- reducedDim(sce_sub, "PCA")[, 1:2]
col_shape <- data.frame(publishedClusters = colData(sce)$publishedClusters, Batch = factor(colData(sce)$Batch))
df <- cbind(pca, col_shape)
# plot PC1 vs. PC2, colored by cluster and shaped by batch
ggplot(df, aes(x = PC1, y = PC2,
colour = publishedClusters,
shape = Batch)) +
geom_point()
Chapter 4 - Cell Clustering and Differential Expression Analysis
Clustering methods for scRNA-Seq:
Differential expression analysis:
Visualization of DE genes:
Example code includes:
# --- Clustering workflow with Seurat (v2-era API) ---
# load Seurat
library(Seurat)
# create a Seurat object from the raw counts, log-normalizing on creation
# and carrying over the cell metadata
seuset <- CreateSeuratObject(
raw.data = assay(sce),
normalization.method = "LogNormalize",
scale.factor = 10000,
meta.data = as.data.frame(colData(sce))
)
# scale (center/scale genes) — result stored separately
scaled_seuset <- ScaleData(object = seuset)
# perform pca on all genes
# NOTE(review): RunPCA is called on the unscaled seuset, not scaled_seuset —
# presumably intentional for the exercise, but confirm.
seuset <- RunPCA(
object = seuset,
pc.genes = rownames(seuset@raw.data),
do.print = FALSE
)
# plot pca, shaped by batch and grouped by published cluster
PCAPlot(object = seuset,
pt.shape = 'Batch',
group.by = 'publishedClusters')
# --- Differential expression with MAST's zero-inflated (hurdle) model ---
# load MAST
library(MAST)
# SingleCellAssay object (prepared upstream); printing shows its dimensions
sca
# fit a zero-inflated regression of expression on cell type and the
# cellular detection rate covariate (cngeneson)
zlm <- zlm(~ celltype + cngeneson, sca)
# summary with a likelihood-ratio test for the celltype9 coefficient.
# FIX: stored as summary_zlm so base::summary() is not masked.
summary_zlm <- summary(zlm, doLRT = "celltype9")
# get the summary table (a data.table)
fit <- summary_zlm$datatable
# hurdle-model ('H' component) p-values for celltype9
pvalue <- fit[contrast == 'celltype9' & component == 'H', .(primerid, `Pr(>Chisq)`)]
# log fold changes for celltype9
logFC <- fit[contrast == 'celltype9' & component == 'logFC', .(primerid, coef)]
# merge to one row per gene with both p-value and logFC
fit <- merge(pvalue, logFC, by = 'primerid')
# FDR-adjusted p-values (added by reference, data.table style)
fit[, padjusted:=p.adjust(`Pr(>Chisq)`, 'fdr')]
# result table.
# FIXES: extract the p-value column with $ (fit[, 'Pr(>Chisq)'] on a
# data.table returns a one-column data.table, not a vector), and spell out
# fit$padjusted (the original fit$padj only worked via partial matching).
res <- data.frame(gene = fit$primerid,
pvalue = fit$`Pr(>Chisq)`,
padjusted = fit$padjusted,
logFC = fit$coef)
# --- Visualization of the most differentially expressed genes ---
# order by adjusted p-value and take the top 20 genes
res <- res[order(res$padjusted), ]
mostDE <- res$gene[1:20]
# flag which rows belong to the top-20 set (used for coloring)
res$mostDE <- res$gene %in% mostDE
# volcano plot: effect size vs. significance, top genes highlighted
# NOTE(review): the axis label says "log2 fold change" but logFC here is the
# MAST hurdle-model coefficient — confirm the scale before relabeling.
ggplot(res, aes(x=logFC, y=-log10(padjusted), color=mostDE)) +
geom_point() +
ggtitle("Volcano plot") +
xlab("log2 fold change") +
ylab("-log10 adjusted p-value")
# load NMF (provides aheatmap)
library(NMF)
# logcounts for the top-20 DE genes only
norm <- assay(sce[mostDE, ], "logcounts")
mat <- as.matrix(norm)
# heatmap with cells annotated by published cluster
aheatmap(mat, annCol = colData(sce)$publishedClusters)
Chapter 1 - Differential Expression Analysis
Differential expression analysis:
Differential expression data:
ExpressionSet class:
The limma package:
Example code includes:
# --- ExpressionSet basics and a first limma fit (leukemia study) ---
# (x = expression matrix, p = phenotype data, f = feature data; all
# provided by the course environment)
# Create a boxplot of the first gene in the expression matrix
boxplot(x[1, ] ~ p[, "Disease"], main = f[1, "symbol"])
# Load package
library(Biobase)
# Create ExpressionSet object bundling expression, phenotype and feature data
eset <- ExpressionSet(assayData = x,
phenoData = AnnotatedDataFrame(p),
featureData = AnnotatedDataFrame(f))
# View the number of features (rows) and samples (columns)
dim(eset)
# Subset to only include the 1000th gene (row) and the first 10 samples
eset_sub <- eset[1000, 1:10]
# Check the dimensions of the subset
dim(eset_sub)
# Create a boxplot of the first gene in eset_sub, using the accessor
# functions exprs(), pData() and fData()
boxplot(exprs(eset_sub)[1, ] ~ pData(eset_sub)[, "Disease"],
main = fData(eset_sub)[1, "symbol"])
# Create design matrix for leukemia study (intercept + Disease effect)
design <- model.matrix(~Disease, data = pData(eset))
# Count the number of samples modeled by each coefficient
colSums(design)
# Load package
library(limma)
# Fit the model (one linear model per gene)
fit <- lmFit(eset, design)
# Calculate the moderated t-statistics (empirical Bayes shrinkage)
fit <- eBayes(fit)
# Summarize results for the Diseasestable coefficient
results <- decideTests(fit[, "Diseasestable"])
summary(results)
Chapter 2 - Flexible Models for Common Study Designs
Flexible linear models:
Studies with more than two groups:
Factorial experimental design:
Example code includes:
# --- limma with contrasts: group-means design, >2 groups, and a 2x2 factorial ---
# Create design matrix with no intercept (group-means parameterization)
design <- model.matrix(~0 + Disease, data = pData(eset))
# Count the number of samples modeled by each coefficient
colSums(design)
# Load package
library(limma)
# Create a contrasts matrix: progressive vs. stable disease
cm <- makeContrasts(status = Diseaseprogres. - Diseasestable, levels = design)
# View the contrasts matrix
cm
# Load package
library(limma)
# Fit the model
fit <- lmFit(eset, design)
# Fit the contrasts
fit2 <- contrasts.fit(fit, contrasts = cm)
# Calculate the t-statistics for the contrasts
fit2 <- eBayes(fit2)
# Summarize results
results <- decideTests(fit2)
summary(results)
# Create design matrix with no intercept (three oxygen levels)
design <- model.matrix(~0 + oxygen, data = pData(eset))
# Count the number of samples modeled by each coefficient
colSums(design)
# Load package
library(limma)
# Create a contrasts matrix: all three pairwise oxygen-level comparisons
cm <- makeContrasts(ox05vox01 = oxygenox05 - oxygenox01,
ox21vox01 = oxygenox21 - oxygenox01,
ox21vox05 = oxygenox21 - oxygenox05,
levels = design)
# View the contrasts matrix
cm
# Load package
library(limma)
# Fit the model
fit <- lmFit(eset, design)
# Fit the contrasts
fit2 <- contrasts.fit(fit, contrasts = cm)
# Calculate the t-statistics for the contrasts
fit2 <- eBayes(fit2)
# Summarize results
results <- decideTests(fit2)
summary(results)
# Create single variable combining the two factors of the 2x2 design
group <- with(pData(eset), paste(type, water, sep = "."))
group <- factor(group)
# Create design matrix with no intercept
design <- model.matrix(~0 + group)
colnames(design) <- levels(group)
# Count the number of samples modeled by each coefficient
colSums(design)
# Load package
library(limma)
# Create a contrasts matrix: main effects plus the interaction
# (difference of the two water effects between types)
cm <- makeContrasts(type_normal = nm6.normal - dn34.normal,
type_drought = nm6.drought - dn34.drought,
water_nm6 = nm6.drought - nm6.normal,
water_dn34 = dn34.drought - dn34.normal,
interaction = (nm6.drought - nm6.normal) - (dn34.drought - dn34.normal),
levels = design)
# View the contrasts matrix
cm
# Load package
library(limma)
# Fit the model
fit <- lmFit(eset, design)
# Fit the contrasts
fit2 <- contrasts.fit(fit, contrasts = cm)
# Calculate the t-statistics for the contrasts
fit2 <- eBayes(fit2)
# Summarize results
results <- decideTests(fit2)
summary(results)
Chapter 3 - Pre-processing and post-processing
Normalizing and filtering:
Accounting for technical batch effects:
Visualizing results:
Enrichment testing:
Example code includes:
# --- Pre-processing (normalization, filtering, batch removal) and
# --- post-processing (p-value histogram, volcano, enrichment) ---
# Load package
library(limma)
# View the distribution of the raw data
plotDensities(eset, legend = FALSE)
# Log transform
exprs(eset) <- log(exprs(eset))
plotDensities(eset, legend = FALSE)
# Quantile normalize so all samples share the same distribution
exprs(eset) <- normalizeBetweenArrays(exprs(eset))
plotDensities(eset, legend = FALSE)
# Load package
library(limma)
# View the normalized gene expression levels with the cutoff marked
plotDensities(eset, legend = FALSE); abline(v = 5)
# Determine the genes with mean expression level greater than 5
keep <- rowMeans(exprs(eset)) > 5
sum(keep)
# Filter out low-expressed genes
eset <- eset[keep, ]
plotDensities(eset, legend = FALSE)
# Load package
library(limma)
# Plot principal components labeled by treatment
plotMDS(eset, labels = pData(eset)[, "treatment"], gene.selection = "common")
# Plot principal components labeled by batch
plotMDS(eset, labels = pData(eset)[, "batch"], gene.selection = "common")
# Load package
library(limma)
# Remove the batch effect from the expression values
# NOTE(review): eset is passed directly rather than exprs(eset) — limma
# coerces it internally; confirm this behaves as expected for ExpressionSet.
exprs(eset) <- removeBatchEffect(eset, batch = pData(eset)[, "batch"])
# Plot principal components labeled by treatment
plotMDS(eset, labels = pData(eset)[, "treatment"], gene.selection = "common")
# Plot principal components labeled by batch
plotMDS(eset, labels = pData(eset)[, "batch"], gene.selection = "common")
# Obtain the summary statistics for every gene
stats <- topTable(fit2, number = nrow(fit2), sort.by = "none")
# Plot a histogram of the p-values
hist(stats[, "P.Value"])
# Create a volcano plot. Highlight the top 5 genes
volcanoplot(fit2, highlight = 5, names = fit2$genes$symbol)
# Extract the entrez gene IDs
entrez <- fit2$genes[, "entrez"]
# Test for enriched KEGG Pathways (human)
enrich_kegg <- kegga(fit2, geneid = entrez, species = "Hs")
# View the top 20 enriched KEGG pathways
topKEGG(enrich_kegg, number=20)
# Extract the entrez gene IDs
entrez <- fit2$genes[, "entrez"]
# Test for enriched GO categories
enrich_go <- goana(fit2, geneid = entrez, species = "Hs")
# View the top 20 enriched GO Biological Processes
topGO(enrich_go, ontology = "BP", number=20)
Chapter 4 - Case Study: Effect of Doxorubicin Treatment
Pre-process data:
Model the data:
Inspect the results:
Wrap up:
Example code includes:
# --- Case study: doxorubicin treatment in wild-type vs. Top2b-null mice ---
# Log transform
exprs(eset) <- log(exprs(eset))
plotDensities(eset, group = pData(eset)[, "genotype"], legend = "topright")
# Quantile normalize
exprs(eset) <- normalizeBetweenArrays(exprs(eset))
plotDensities(eset, group = pData(eset)[, "genotype"], legend = "topright")
# Determine the genes with mean expression level greater than 0
keep <- rowMeans(exprs(eset)) > 0
sum(keep)
# Filter the genes
eset <- eset[keep, ]
plotDensities(eset, group = pData(eset)[, "genotype"], legend = "topright")
# Find the row which contains Top2b expression data
top2b <- which(fData(eset)["symbol"] == "Top2b")
# Plot Top2b expression versus genotype (sanity check of the knockout)
boxplot(exprs(eset)[top2b, ] ~ pData(eset)[, "genotype"], main = fData(eset)[top2b, ])
# Plot principal components labeled by genotype
plotMDS(eset, labels = pData(eset)[, "genotype"], gene.selection = "common")
# Plot principal components labeled by treatment
plotMDS(eset, labels = pData(eset)[, "treatment"], gene.selection = "common")
# Create single variable combining genotype and treatment
group <- with(pData(eset), paste(genotype, treatment, sep = "."))
group <- factor(group)
# Create design matrix with no intercept (group-means parameterization)
design <- model.matrix(~0 + group)
colnames(design) <- levels(group)
# Count the number of samples modeled by each coefficient
colSums(design)
# Create a contrasts matrix: drug effect per genotype plus the interaction
cm <- makeContrasts(dox_wt = wt.dox - wt.pbs,
dox_top2b = top2b.dox - top2b.pbs,
interaction = (top2b.dox - top2b.pbs) - (wt.dox - wt.pbs),
levels = design)
# View the contrasts matrix
cm
# Fit the model
fit <- lmFit(eset, design)
# Fit the contrasts
fit2 <- contrasts.fit(fit, contrasts = cm)
# Calculate the t-statistics for the contrasts
fit2 <- eBayes(fit2)
# Summarize results
results <- decideTests(fit2)
summary(results)
# Create a Venn diagram of overlap between the three contrasts
vennDiagram(results)
# Obtain the summary statistics for the contrast dox_wt
stats_dox_wt <- topTable(fit2, coef = "dox_wt", number = nrow(fit2), sort.by = "none")
# Obtain the summary statistics for the contrast dox_top2b
stats_dox_top2b <- topTable(fit2, coef = "dox_top2b", number = nrow(fit2), sort.by = "none")
# Obtain the summary statistics for the contrast interaction
stats_interaction <- topTable(fit2, coef = "interaction", number = nrow(fit2), sort.by = "none")
# Create histograms of the p-values for each contrast
hist(stats_dox_wt[, "P.Value"])
hist(stats_dox_top2b[, "P.Value"])
hist(stats_interaction[, "P.Value"])
# Extract the gene symbols
gene_symbols <- fit2$genes[, "symbol"]
# Create a volcano plot for the contrast dox_wt
volcanoplot(fit2, coef = "dox_wt", highlight = 5, names = gene_symbols)
# Create a volcano plot for the contrast dox_top2b
volcanoplot(fit2, coef = "dox_top2b", highlight = 5, names = gene_symbols)
# Create a volcano plot for the contrast interaction
volcanoplot(fit2, coef = "interaction", highlight = 5, names = gene_symbols)
# Extract the entrez gene IDs
entrez <- fit2$genes[, "entrez"]
# Test for enriched KEGG Pathways for contrast dox_wt (mouse)
enrich_dox_wt <- kegga(fit2, coef = "dox_wt", geneid = entrez, species = "Mm")
# View the top 5 enriched KEGG pathways
topKEGG(enrich_dox_wt, number = 5)
# Test for enriched KEGG Pathways for contrast interaction
enrich_interaction <- kegga(fit2, coef = "interaction", geneid = entrez, species = "Mm")
# View the top 5 enriched KEGG pathways
topKEGG(enrich_interaction, number = 5)
Chapter 1 - rbokeh Introduction
Getting started with rbokeh:
Layers for rbokeh:
Layers for rbokeh (continued):
Example code includes:
## --- rbokeh basics: point and line layers on the gapminder data ---
## load rbokeh, gapminder and dplyr libraries
library(rbokeh)
library(gapminder)
library(dplyr)
## explore gapminder dataset
str(gapminder)
## filter gapminder data by year 1982
dat_1982 <- gapminder %>% filter(year == 1982)
## plot life expectancy Vs GDP per Capita using dat_1982
figure(legend_location = "bottom_right", title = "Life Expectancy Vs. GDP per Capita in 1982") %>%
ly_points(x = gdpPercap, y = lifeExp, data = dat_1982,
color = continent, hover = c(continent, country, pop)
)
## filter the dataset for the continent Africa and year 1967
data_africa <- gapminder %>%
filter(year==1967, continent=="Africa")
## view data_africa
data_africa
## plot life expectancy Vs GDP per Capita using data_africa
figure(legend_location = "bottom_right",
title = "Life Expectancy Vs. GDP per Capita in Africa - 1967"
) %>%
ly_points(x = gdpPercap, y = lifeExp, data = data_africa, hover = c(country, pop))
## add a new column with gdp in millions
gapminder_mill <- gapminder %>%
mutate(gdp_millions = gdpPercap * pop / 10^6)
## view the first 6 entries in gapminder after adding gdp_millions
head(gapminder_mill)
## extract the entries for "Rwanda"
data_rwanda <- gapminder_mill %>%
filter(country=="Rwanda")
## explore data_rwanda
data_rwanda
## plot gdp over time as a line layer
figure(data = data_rwanda) %>%
ly_lines(x = year, y = gdp_millions, width = 2)
## explore the economics dataset (from ggplot2)
data(economics)
str(economics)
## pass vectors to x & y directly
figure() %>%
ly_lines(x = economics$date, y = economics$pce)
## pass column names and a dataframe instead (equivalent plot)
figure() %>%
ly_lines(x = date, y = pce, data = economics)
## plot unemployment rate versus time and change the default `ylab`
figure(ylab = "unemployment %") %>%
ly_lines(x=date, y=100*unemploy/pop, data=economics)
## Keep only the 1992 observations from gapminder
dat_1992 <- filter(gapminder, year == 1992)
str(dat_1992)
## Scatter plot of life expectancy against GDP per capita,
## coloured by continent, with the legend in the lower right
plot_1992 <- figure(legend_location = "bottom_right") %>%
  ly_points(x = gdpPercap, y = lifeExp, color = continent, data = dat_1992)
## render the figure
plot_1992
## --- Combining line and point layers for a handful of countries ---
data_countries <- gapminder %>%
filter(country %in% c("United Kingdom", "Australia", "Canada", "United States", "New Zealand"))
str(data_countries)
## GDP per capita over time: lines plus point markers, one color per country
figure(data = data_countries, legend="top_left") %>%
ly_lines(x = year, y = gdpPercap , color = country) %>%
ly_points(x=year, y=gdpPercap, color=country)
data_countries <- gapminder %>%
filter(country %in% c("China", "India"))
## create a line plot with lifeExp vs. year
fig_countries <- figure(legend="top_left") %>%
ly_lines(x=year, y=lifeExp, color=country, data=data_countries)
## View fig_countries
fig_countries
## modify fig_countries by adding a points layer with lifeExp vs. year
fig_countries %>%
ly_points(x=year, y=lifeExp, color=country, data=data_countries)
Chapter 2 - rbokeh Aesthetic Attributes and Figure Options
Plot and Managed Attributes (Part I):
Plot and Managed Attributes (Part II):
Hover Info and Figure Options:
Example code includes:
## --- HDI data preparation and rbokeh aesthetic attributes ---
## read the raw Human Development Index CSV (header offset by one row)
hdiRaw <- read.csv("./RInputFiles/Human Development Index (HDI).csv", skip=1)
str(hdiRaw)
## reshape to long format: one row per country-year, clean names/years
hdi_data <- hdiRaw %>%
gather(key="year", value="human_development_index", -Country, -`HDI.Rank..2017.`) %>%
mutate(country=str_trim(as.character(Country)), year=as.integer(str_sub(year, 2))) %>%
# NOTE(review): 1990:2105 looks like a typo for 1990:2015 — confirm the
# intended upper bound (the source data only extends to 2017).
filter(year %in% 1990:2105) %>%
select(country, year, human_development_index)
str(hdi_data)
## extract "Namibia" and "Botswana" entries from hdi_data
hdi_countries <- hdi_data %>%
filter(country %in% c("Namibia", "Botswana"))
## plot human_development_index versus year: lines plus open-circle markers
hdi_countries <- hdi_countries
fig_col <- figure(data = hdi_countries, legend_location = "bottom_right") %>%
ly_lines(x = year, y = human_development_index, color = country) %>%
ly_points(x = year, y = human_development_index,
fill_color = "white", fill_alpha = 1,
line_color = country, line_alpha = 1,
size = 4
)
## view plot
fig_col
## use a custom palette with colors "green", "red"
fig_col %>%
set_palette(discrete_color = pal_color(c("green", "red")))
## define custom palette (hex colors)
custom_pal <- pal_color(c("#c51b8a", "#31a354"))
## use custom_pal to modify fig_col
fig_col %>%
set_palette(discrete_color=custom_pal)
## explore bechdel dataset using str
data(bechdel, package="fivethirtyeight")
str(bechdel)
## extract entries between 1980 - 2013
dat_80_13 <- bechdel %>%
filter(between(year, 1980, 2013))
## return on investment = inflation-adjusted gross over budget
dat_80_13 <- dat_80_13 %>%
mutate(roi_total = intgross_2013 / budget_2013)
## plot on log scales (heavy right skew otherwise)
figure() %>%
ly_points(x=log(budget_2013), y=log(roi_total), data=dat_80_13)
## same plot with smaller, translucent markers to reduce overplotting
figure() %>%
ly_points(x=log(budget_2013), y=log(roi_total), size=5, line_alpha=0, fill_alpha=0.3, data=dat_80_13)
## filter the data by country = Syrian Arab Republic and Morocco
hdi_countries <- hdi_data %>%
filter(country %in% c("Syrian Arab Republic", "Morocco"))
## change the color and line width
figure(title = "Human Development Index over Time", legend = "bottom_right") %>%
ly_lines(x=year, y=human_development_index, color=country, width=3, data=hdi_countries)
# explore hdi_cpi_data dataset (data not available here; code kept commented)
# str(hdi_cpi_2015)
## add multiple values as hover info (country, cpi_rank)
# figure(legend_location = "bottom_right") %>%
# ly_points(x=corruption_perception_index, y=human_development_index, color=continent, hover=c(country, cpi_rank), size=6, data=hdi_cpi_2015)
## modify the figure theme
# figure(title = "Corruption Perception Index Vs. Human Development Index 2015",
# legend_location = "bottom_right", xgrid = FALSE, ygrid = FALSE,
# xlab = "CPI", ylab = "HDI", theme=bk_ggplot_theme()) %>%
# ly_points(x = corruption_perception_index, y = human_development_index,
# data = hdi_cpi_2015, color = continent, size = 6, hover = c(country, cpi_rank)
# )
Chapter 3 - Data Manipulation for Visualization and More rbokeh Layers
Data Formats:
More rbokeh Layers:
Interaction Tools:
Example code includes:
# Hand-entered 2015 Corruption Perception Index (CPI) / Human Development
# Index (HDI) data in long format: each of the 121 countries appears twice,
# once per index. The vectors below are parallel (same country order).
# country names (listed twice, once per index row)
ctry <- c('Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina', 'Australia', 'Austria', 'Bahrain', 'Bangladesh', 'Belgium', 'Benin', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Costa Rica', 'Croatia', 'Cuba', 'Czech Republic', 'Denmark', 'Djibouti', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Eritrea', 'Ethiopia', 'Finland', 'France', 'Gabon', 'Gambia', 'Germany', 'Ghana', 'Greece', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Haiti', 'Honduras', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iraq', 'Ireland', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Jordan', 'Kenya', 'Kuwait', 'Lebanon', 'Lesotho', 'Liberia', 'Libya', 'Madagascar', 'Malawi', 'Malaysia', 'Mali', 'Mauritania', 'Mauritius', 'Mexico', 'Mongolia', 'Montenegro', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia', 'Nepal', 'Netherlands', 'New Zealand', 'Nicaragua', 'Niger', 'Nigeria', 'Norway', 'Oman', 'Pakistan', 'Panama', 'Paraguay', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Romania', 'Rwanda', 'Sao Tome and Principe', 'Saudi Arabia', 'Senegal', 'Serbia', 'Sierra Leone', 'Singapore', 'Slovenia', 'South Africa', 'Spain', 'Sri Lanka', 'Sudan', 'Sweden', 'Switzerland', 'Thailand', 'Togo', 'Trinidad and Tobago', 'Tunisia', 'Turkey', 'Uganda', 'United Kingdom', 'United States', 'Uruguay', 'Zambia', 'Zimbabwe', 'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina', 'Australia', 'Austria', 'Bahrain', 'Bangladesh', 'Belgium', 'Benin', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Costa Rica', 'Croatia', 'Cuba', 'Czech Republic', 'Denmark', 'Djibouti', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Eritrea', 'Ethiopia', 'Finland', 'France', 'Gabon', 'Gambia', 'Germany', 'Ghana', 
'Greece', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Haiti', 'Honduras', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iraq', 'Ireland', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Jordan', 'Kenya', 'Kuwait', 'Lebanon', 'Lesotho', 'Liberia', 'Libya', 'Madagascar', 'Malawi', 'Malaysia', 'Mali', 'Mauritania', 'Mauritius', 'Mexico', 'Mongolia', 'Montenegro', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia', 'Nepal', 'Netherlands', 'New Zealand', 'Nicaragua', 'Niger', 'Nigeria', 'Norway', 'Oman', 'Pakistan', 'Panama', 'Paraguay', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Romania', 'Rwanda', 'Sao Tome and Principe', 'Saudi Arabia', 'Senegal', 'Serbia', 'Sierra Leone', 'Singapore', 'Slovenia', 'South Africa', 'Spain', 'Sri Lanka', 'Sudan', 'Sweden', 'Switzerland', 'Thailand', 'Togo', 'Trinidad and Tobago', 'Tunisia', 'Turkey', 'Uganda', 'United Kingdom', 'United States', 'Uruguay', 'Zambia', 'Zimbabwe')
# three-letter country codes, aligned with ctry
ctryCode <- c('AFG', 'ALB', 'DZA', 'AGO', 'ARG', 'AUS', 'AUT', 'BHR', 'BGD', 'BEL', 'BEN', 'BIH', 'BWA', 'BRA', 'BGR', 'BFA', 'BDI', 'KHM', 'CMR', 'CAN', 'CAF', 'TCD', 'CHL', 'CHN', 'COL', 'COM', 'CRI', 'HRV', 'CUB', 'CZE', 'DNK', 'DJI', 'DOM', 'ECU', 'EGY', 'SLV', 'ERI', 'ETH', 'FIN', 'FRA', 'GAB', 'GMB', 'DEU', 'GHA', 'GRC', 'GTM', 'GIN', 'GNB', 'HTI', 'HND', 'HUN', 'ISL', 'IND', 'IDN', 'IRQ', 'IRL', 'ISR', 'ITA', 'JAM', 'JPN', 'JOR', 'KEN', 'KWT', 'LBN', 'LSO', 'LBR', 'LBY', 'MDG', 'MWI', 'MYS', 'MLI', 'MRT', 'MUS', 'MEX', 'MNG', 'MON', 'MAR', 'MOZ', 'MMR', 'NAM', 'NPL', 'NLD', 'NZL', 'NIC', 'NER', 'NGA', 'NOR', 'OMN', 'PAK', 'PAN', 'PRY', 'PER', 'PHL', 'POL', 'PRT', 'ROM', 'RWA', 'STP', 'SAU', 'SEN', 'SCG', 'SLE', 'SGP', 'SVN', 'ZAF', 'ESP', 'LKA', 'SDN', 'SWE', 'CHE', 'THA', 'TGO', 'TTO', 'TUN', 'TUR', 'UGA', 'GBR', 'USA', 'URY', 'ZMB', 'ZWE', 'AFG', 'ALB', 'DZA', 'AGO', 'ARG', 'AUS', 'AUT', 'BHR', 'BGD', 'BEL', 'BEN', 'BIH', 'BWA', 'BRA', 'BGR', 'BFA', 'BDI', 'KHM', 'CMR', 'CAN', 'CAF', 'TCD', 'CHL', 'CHN', 'COL', 'COM', 'CRI', 'HRV', 'CUB', 'CZE', 'DNK', 'DJI', 'DOM', 'ECU', 'EGY', 'SLV', 'ERI', 'ETH', 'FIN', 'FRA', 'GAB', 'GMB', 'DEU', 'GHA', 'GRC', 'GTM', 'GIN', 'GNB', 'HTI', 'HND', 'HUN', 'ISL', 'IND', 'IDN', 'IRQ', 'IRL', 'ISR', 'ITA', 'JAM', 'JPN', 'JOR', 'KEN', 'KWT', 'LBN', 'LSO', 'LBR', 'LBY', 'MDG', 'MWI', 'MYS', 'MLI', 'MRT', 'MUS', 'MEX', 'MNG', 'MON', 'MAR', 'MOZ', 'MMR', 'NAM', 'NPL', 'NLD', 'NZL', 'NIC', 'NER', 'NGA', 'NOR', 'OMN', 'PAK', 'PAN', 'PRY', 'PER', 'PHL', 'POL', 'PRT', 'ROM', 'RWA', 'STP', 'SAU', 'SEN', 'SCG', 'SLE', 'SGP', 'SVN', 'ZAF', 'ESP', 'LKA', 'SDN', 'SWE', 'CHE', 'THA', 'TGO', 'TTO', 'TUN', 'TUR', 'UGA', 'GBR', 'USA', 'URY', 'ZMB', 'ZWE')
# Transparency International region codes, aligned with ctry
regn <- c('AP', 'ECA', 'MENA', 'SSA', 'AME', 'AP', 'WE/EU', 'MENA', 'AP', 'WE/EU', 'SSA', 'ECA', 'SSA', 'AME', 'WE/EU', 'SSA', 'SSA', 'AP', 'SSA', 'AME', 'SSA', 'SSA', 'AME', 'AP', 'AME', 'SSA', 'AME', 'WE/EU', 'AME', 'WE/EU', 'WE/EU', 'SSA', 'AME', 'AME', 'MENA', 'AME', 'SSA', 'SSA', 'WE/EU', 'WE/EU', 'SSA', 'SSA', 'WE/EU', 'SSA', 'WE/EU', 'AME', 'SSA', 'SSA', 'AME', 'AME', 'WE/EU', 'WE/EU', 'AP', 'AP', 'MENA', 'WE/EU', 'MENA', 'WE/EU', 'AME', 'AP', 'MENA', 'SSA', 'MENA', 'MENA', 'SSA', 'SSA', 'MENA', 'SSA', 'SSA', 'AP', 'SSA', 'SSA', 'SSA', 'AME', 'AP', 'ECA', 'MENA', 'SSA', 'AP', 'SSA', 'AP', 'WE/EU', 'AP', 'AME', 'SSA', 'SSA', 'WE/EU', 'MENA', 'AP', 'AME', 'AME', 'AME', 'AP', 'WE/EU', 'WE/EU', 'WE/EU', 'SSA', 'SSA', 'MENA', 'SSA', 'ECA', 'SSA', 'AP', 'WE/EU', 'SSA', 'WE/EU', 'AP', 'MENA', 'WE/EU', 'WE/EU', 'AP', 'SSA', 'AME', 'MENA', 'ECA', 'SSA', 'WE/EU', 'AME', 'AME', 'SSA', 'SSA', 'AP', 'ECA', 'MENA', 'SSA', 'AME', 'AP', 'WE/EU', 'MENA', 'AP', 'WE/EU', 'SSA', 'ECA', 'SSA', 'AME', 'WE/EU', 'SSA', 'SSA', 'AP', 'SSA', 'AME', 'SSA', 'SSA', 'AME', 'AP', 'AME', 'SSA', 'AME', 'WE/EU', 'AME', 'WE/EU', 'WE/EU', 'SSA', 'AME', 'AME', 'MENA', 'AME', 'SSA', 'SSA', 'WE/EU', 'WE/EU', 'SSA', 'SSA', 'WE/EU', 'SSA', 'WE/EU', 'AME', 'SSA', 'SSA', 'AME', 'AME', 'WE/EU', 'WE/EU', 'AP', 'AP', 'MENA', 'WE/EU', 'MENA', 'WE/EU', 'AME', 'AP', 'MENA', 'SSA', 'MENA', 'MENA', 'SSA', 'SSA', 'MENA', 'SSA', 'SSA', 'AP', 'SSA', 'SSA', 'SSA', 'AME', 'AP', 'ECA', 'MENA', 'SSA', 'AP', 'SSA', 'AP', 'WE/EU', 'AP', 'AME', 'SSA', 'SSA', 'WE/EU', 'MENA', 'AP', 'AME', 'AME', 'AME', 'AP', 'WE/EU', 'WE/EU', 'WE/EU', 'SSA', 'SSA', 'MENA', 'SSA', 'ECA', 'SSA', 'AP', 'WE/EU', 'SSA', 'WE/EU', 'AP', 'MENA', 'WE/EU', 'WE/EU', 'AP', 'SSA', 'AME', 'MENA', 'ECA', 'SSA', 'WE/EU', 'AME', 'AME', 'SSA', 'SSA')
# continents, aligned with ctry
cnt <- c('Asia', 'Europe', 'Africa', 'Africa', 'Americas', 'Oceania', 'Europe', 'Asia', 'Asia', 'Europe', 'Africa', 'Europe', 'Africa', 'Americas', 'Europe', 'Africa', 'Africa', 'Asia', 'Africa', 'Americas', 'Africa', 'Africa', 'Americas', 'Asia', 'Americas', 'Africa', 'Americas', 'Europe', 'Americas', 'Europe', 'Europe', 'Africa', 'Americas', 'Americas', 'Africa', 'Americas', 'Africa', 'Africa', 'Europe', 'Europe', 'Africa', 'Africa', 'Europe', 'Africa', 'Europe', 'Americas', 'Africa', 'Africa', 'Americas', 'Americas', 'Europe', 'Europe', 'Asia', 'Asia', 'Asia', 'Europe', 'Asia', 'Europe', 'Americas', 'Asia', 'Asia', 'Africa', 'Asia', 'Asia', 'Africa', 'Africa', 'Africa', 'Africa', 'Africa', 'Asia', 'Africa', 'Africa', 'Africa', 'Americas', 'Asia', 'Europe', 'Africa', 'Africa', 'Asia', 'Africa', 'Asia', 'Europe', 'Oceania', 'Americas', 'Africa', 'Africa', 'Europe', 'Asia', 'Asia', 'Americas', 'Americas', 'Americas', 'Asia', 'Europe', 'Europe', 'Europe', 'Africa', 'Africa', 'Asia', 'Africa', 'Europe', 'Africa', 'Asia', 'Europe', 'Africa', 'Europe', 'Asia', 'Africa', 'Europe', 'Europe', 'Asia', 'Africa', 'Americas', 'Africa', 'Europe', 'Africa', 'Europe', 'Americas', 'Americas', 'Africa', 'Africa', 'Asia', 'Europe', 'Africa', 'Africa', 'Americas', 'Oceania', 'Europe', 'Asia', 'Asia', 'Europe', 'Africa', 'Europe', 'Africa', 'Americas', 'Europe', 'Africa', 'Africa', 'Asia', 'Africa', 'Americas', 'Africa', 'Africa', 'Americas', 'Asia', 'Americas', 'Africa', 'Americas', 'Europe', 'Americas', 'Europe', 'Europe', 'Africa', 'Americas', 'Americas', 'Africa', 'Americas', 'Africa', 'Africa', 'Europe', 'Europe', 'Africa', 'Africa', 'Europe', 'Africa', 'Europe', 'Americas', 'Africa', 'Africa', 'Americas', 'Americas', 'Europe', 'Europe', 'Asia', 'Asia', 'Asia', 'Europe', 'Asia', 'Europe', 'Americas', 'Asia', 'Asia', 'Africa', 'Asia', 'Asia', 'Africa', 'Africa', 'Africa', 'Africa', 'Africa', 'Asia', 'Africa', 'Africa', 'Africa', 'Americas', 'Asia', 'Europe', 'Africa', 'Africa', 
'Asia', 'Africa', 'Asia', 'Europe', 'Oceania', 'Americas', 'Africa', 'Africa', 'Europe', 'Asia', 'Asia', 'Americas', 'Americas', 'Americas', 'Asia', 'Europe', 'Europe', 'Europe', 'Africa', 'Africa', 'Asia', 'Africa', 'Europe', 'Africa', 'Asia', 'Europe', 'Africa', 'Europe', 'Asia', 'Africa', 'Europe', 'Europe', 'Asia', 'Africa', 'Americas', 'Africa', 'Europe', 'Africa', 'Europe', 'Americas', 'Americas', 'Africa', 'Africa')
# which index each row of the long table holds.
# BUG FIX: the original labeled the FIRST 121 rows "corruption_perception_index",
# but the first 121 entries of vl below are the 0-1 HDI values (e.g. Australia
# 0.939, Denmark 0.925) and the second 121 are the integer CPI scores (e.g.
# Australia 79, Afghanistan 11) — the labels were swapped relative to the data.
idx <- rep(c("human_development_index", "corruption_perception_index"), each=121)
# CPI rank per country (repeated for both index rows)
cpiRk <- c(166, 88, 88, 163, 106, 13, 16, 50, 139, 15, 83, 76, 29, 76, 69, 76, 150, 150, 130, 10, 145, 147, 23, 83, 83, 136, 40, 50, 56, 38, 1, 98, 102, 106, 88, 72, 154, 102, 3, 23, 98, 123, 11, 56, 58, 123, 139, 158, 158, 111, 50, 13, 76, 88, 161, 18, 32, 61, 69, 18, 45, 139, 55, 123, 61, 83, 161, 123, 111, 54, 95, 111, 45, 111, 72, 61, 88, 111, 147, 45, 130, 9, 1, 130, 98, 136, 5, 60, 117, 72, 130, 88, 95, 29, 28, 58, 43, 66, 48, 61, 71, 119, 7, 34, 61, 37, 83, 165, 4, 6, 76, 106, 72, 76, 66, 139, 11, 16, 21, 76, 150, 166, 88, 88, 163, 106, 13, 16, 50, 139, 15, 83, 76, 29, 76, 69, 76, 150, 150, 130, 10, 145, 147, 23, 83, 83, 136, 40, 50, 56, 38, 1, 98, 102, 106, 88, 72, 154, 102, 3, 23, 98, 123, 11, 56, 58, 123, 139, 158, 158, 111, 50, 13, 76, 88, 161, 18, 32, 61, 69, 18, 45, 139, 55, 123, 61, 83, 161, 123, 111, 54, 95, 111, 45, 111, 72, 61, 88, 111, 147, 45, 130, 9, 1, 130, 98, 136, 5, 60, 117, 72, 130, 88, 95, 29, 28, 58, 43, 66, 48, 61, 71, 119, 7, 34, 61, 37, 83, 165, 4, 6, 76, 106, 72, 76, 66, 139, 11, 16, 21, 76, 150)
# index values: first 121 are HDI (0-1 scale), second 121 are CPI scores (0-100)
vl <- c(0.479, 0.764, 0.745, 0.533, 0.827, 0.939, 0.893, 0.824, 0.579, 0.896, 0.485, 0.75, 0.698, 0.754, 0.794, 0.402, 0.404, 0.563, 0.518, 0.92, 0.352, 0.396, 0.847, 0.738, 0.727, 0.498, 0.776, 0.827, 0.775, 0.878, 0.925, 0.473, 0.722, 0.739, 0.691, 0.68, 0.42, 0.448, 0.895, 0.897, 0.697, 0.452, 0.926, 0.579, 0.866, 0.64, 0.414, 0.424, 0.493, 0.625, 0.836, 0.921, 0.624, 0.689, 0.649, 0.923, 0.899, 0.887, 0.73, 0.903, 0.742, 0.555, 0.8, 0.763, 0.497, 0.427, 0.716, 0.512, 0.476, 0.789, 0.442, 0.513, 0.781, 0.762, 0.735, 0.807, 0.647, 0.418, 0.556, 0.64, 0.558, 0.924, 0.915, 0.645, 0.353, 0.527, 0.949, 0.796, 0.55, 0.788, 0.693, 0.74, 0.682, 0.855, 0.843, 0.802, 0.498, 0.574, 0.847, 0.494, 0.776, 0.42, 0.925, 0.89, 0.666, 0.884, 0.766, 0.49, 0.913, 0.939, 0.74, 0.487, 0.78, 0.725, 0.767, 0.493, 0.91, 0.92, 0.795, 0.579, 0.516, 11, 36, 36, 15, 32, 79, 76, 51, 25, 77, 37, 38, 63, 38, 41, 38, 21, 21, 27, 83, 24, 22, 70, 37, 37, 26, 55, 51, 47, 56, 91, 34, 33, 32, 36, 39, 18, 33, 90, 70, 34, 28, 81, 47, 46, 28, 25, 17, 17, 31, 51, 79, 38, 36, 16, 75, 61, 44, 41, 75, 53, 25, 49, 28, 44, 37, 16, 28, 31, 50, 35, 31, 53, 31, 39, 44, 36, 31, 22, 53, 27, 84, 91, 27, 34, 26, 88, 45, 30, 39, 27, 36, 35, 63, 64, 46, 54, 42, 52, 44, 40, 29, 85, 60, 44, 58, 37, 12, 89, 86, 38, 32, 39, 38, 42, 25, 81, 76, 74, 38, 21)
# assemble the long-format data frame (one row per country-index pair)
hdi_cpi_data_long <- data.frame(country=ctry, year=2015L, country_code=ctryCode, cpi_rank=cpiRk,
region=regn, continent=cnt, index=idx, value=vl,
stringsAsFactors = FALSE
)
# Demonstrates a long -> wide -> long round trip with tidyr, plus an rbokeh
# scatter plot of the widened data.
# NOTE(review): spread()/gather() are superseded by pivot_wider()/pivot_longer(),
# but this document runs tidyr 0.8.1 (see header), which predates the pivot_* API,
# so the superseded verbs are kept.
## explore hdi_cpi_data_long
str(hdi_cpi_data_long)
## How many unique values are there in the index column?
unique(hdi_cpi_data_long$index)
## convert from long to wide
hdi_cpi_data_wide <- hdi_cpi_data_long %>%
spread(key=index, value=value)
## display the first 5 rows from hdi_cpi_data_wide
head(hdi_cpi_data_wide, 5)
## plot corruption_perception_index versus human_development_index
# rbokeh scatter: one point per country, colored by continent, with a hover
# tooltip showing the country and both index values.
figure(legend_location = "top_left") %>%
ly_points(x=human_development_index, y=corruption_perception_index, color=continent, alpha=0.7,
hover=c(country, cpi_rank,corruption_perception_index, human_development_index),
data=hdi_cpi_data_wide
)
## convert from wide to long
hdi_cpi_remake_long <- hdi_cpi_data_wide %>%
gather(key="index", value="value", corruption_perception_index, human_development_index)
## display the first 5 rows of hdi_data_long
head(hdi_cpi_remake_long, 5)
# NOTE(review): the round trip may reorder rows/columns, so all.equal() need
# not return TRUE even when the data content matches -- verify the output.
all.equal(hdi_cpi_data_long, hdi_cpi_remake_long)
## explore the unique values in the movie_budget column
# unique(dat_90_13_long$movie_budget)
## spread the values in the `movie_budget` in two columns
# dat_90_13_wide <- dat_90_13_long %>%
# spread(key=movie_budget, value=value)
## View column names of dat_90_13_wide
# names(dat_90_13_wide)
## create a scatter plot with log(budget_2013) Vs log(intgross_2013)
# p_scatter <- figure() %>%
# ly_points(y=log(intgross_2013), x=log(budget_2013), size=4, alpha=0.5, data=dat_90_13_wide)
## View plot
# p_scatter
## fit a linear reg model
# lin_reg <- lm(log(intgross_2013) ~ log(budget_2013), data = dat_90_13)
## add the linear regression line layer to p_scatter
# p_scatter %>%
# ly_abline(lin_reg)
## extract entries for year 2007
dat_2007 <- gapminder %>%
filter(year == 2007)
dat_2002 <- gapminder %>%
filter(year == 2002)
## create scatter plot
figure(toolbar_location="above", legend_location="bottom_right") %>%
ly_points(x=gdpPercap, y=lifeExp, color=continent, size=6, alpha=0.7,
data=dat_2007, hover=c(country, lifeExp, gdpPercap)
)
figure(legend_location = "bottom_right", tools=c("resize", "save")) %>%
ly_points(x = gdpPercap, y = lifeExp, data = dat_2002, color = continent)
figure(legend_location = "bottom_right", tools=c("resize", "save"), toolbar_location=NULL) %>%
ly_points(x = gdpPercap, y = lifeExp, data = dat_2002, color = continent)
Chapter 4 - Grid Plots and Maps
Intro to Grid Plots:
Facets with Grid Plots:
rbokeh maps:
Example code includes:
# US tuberculosis case counts by gender, year (1995-2008, as a factor) and age
# code. The `age` codes are presumably age bands (1524 = ages 15-24, ...,
# 65 = 65 and over) -- TODO confirm against the course data dictionary.
tb <- data.frame(iso2="US",
gender=rep(c("m", "f"), each=84),
year=factor(c(1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008)),
age=c(1524, 1524, 1524, 1524, 1524, 1524, 1524, 1524, 1524, 1524, 1524, 1524, 1524, 1524, 2534, 2534, 2534, 2534, 2534, 2534, 2534, 2534, 2534, 2534, 2534, 2534, 2534, 2534, 3544, 3544, 3544, 3544, 3544, 3544, 3544, 3544, 3544, 3544, 3544, 3544, 3544, 3544, 4554, 4554, 4554, 4554, 4554, 4554, 4554, 4554, 4554, 4554, 4554, 4554, 4554, 4554, 5564, 5564, 5564, 5564, 5564, 5564, 5564, 5564, 5564, 5564, 5564, 5564, 5564, 5564, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 1524, 1524, 1524, 1524, 1524, 1524, 1524, 1524, 1524, 1524, 1524, 1524, 1524, 1524, 2534, 2534, 2534, 2534, 2534, 2534, 2534, 2534, 2534, 2534, 2534, 2534, 2534, 2534, 3544, 3544, 3544, 3544, 3544, 3544, 3544, 3544, 3544, 3544, 3544, 3544, 3544, 3544, 4554, 4554, 4554, 4554, 4554, 4554, 4554, 4554, 4554, 4554, 4554, 4554, 4554, 4554, 5564, 5564, 5564, 5564, 5564, 5564, 5564, 5564, 5564, 5564, 5564, 5564, 5564, 5564, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65),
count=c(355, 333, 330, 321, 331, 365, 320, 343, 365, 362, 383, 388, 414, 375, 876, 815, 701, 663, 616, 602, 613, 562, 526, 547, 535, 568, 490, 513, 1417, 1219, 1127, 1009, 1011, 906, 824, 813, 754, 728, 666, 659, 572, 495, 1121, 1073, 979, 1007, 930, 904, 876, 795, 828, 829, 767, 759, 744, 725, 742, 678, 679, 628, 601, 577, 524, 490, 487, 504, 499, 531, 533, 526, 1099, 1007, 944, 914, 801, 738, 649, 592, 650, 582, 624, 596, 562, 561, 280, 289, 269, 269, 232, 246, 239, 233, 277, 265, 241, 257, 257, 220, 579, 487, 449, 425, 391, 376, 410, 423, 353, 339, 348, 384, 338, 329, 499, 478, 447, 424, 394, 349, 346, 362, 310, 302, 276, 263, 260, 269, 285, 279, 254, 267, 245, 253, 247, 255, 269, 252, 242, 212, 225, 224, 202, 217, 201, 179, 244, 152, 176, 167, 169, 166, 161, 146, 135, 172, 591, 541, 514, 492, 444, 396, 389, 370, 354, 344, 322, 303, 308, 300),
stringsAsFactors = FALSE
)
str(tb)
# Bar plots for a single age band (numeric code 2534, i.e. the 25-34 group),
# first as raw counts and then as within-year shares, combined in a grid plot.
tb_2534 <- tb %>% filter(age==2534)
str(tb_2534)
## create a bar plot for age group tb_2534
bar_2534 <- figure() %>%
ly_bar(x=year, y=count, color=gender, data=tb_2534, hover=TRUE)
## View figure
bar_2534
## create a bar plot for age group tb_2534 with % on the y-axis
# position = "fill" stacks the gender bars to 100% of each year.
bar_2534_percent <- figure(ylab = "share") %>%
ly_bar(x = year, y = count, tb_2534, color = gender, hover = TRUE, position = "fill")
## View figure
bar_2534_percent
## create a list with bar_2534 and bar_2534_percent figures
fig_list <- list(bar_2534 = bar_2534, bar_2534_percent = bar_2534_percent)
## create a grid plot
grid_plot(fig_list, width=1000, height=400)
## create a grid plot with same axes limits
# same_axes = TRUE links the axis ranges of the two panels.
grid_plot(figs = fig_list, width = 1000, height = 400, same_axes=TRUE)
## Line-plot helper: tb case counts over time, one line per age group.
## x: a data frame with `year`, `count`, and `age` columns (a gender subset
##    of `tb`). Returns an rbokeh figure.
plot_line <- function(x){
figure() %>%
# Fixed for clarity: the original passed `year` positionally after the named
# `y = count`, so `year` silently matched the `x` aesthetic -- easy to
# misread. Naming both aesthetics is behaviorally identical.
ly_lines(x = year, y = count, data = x, color = age, alpha = 1, width = 2)
}
# Build one line plot per gender (via plot_line defined above), first by
# manual filtering, then more generally with split() + lapply().
## create two dataframes for female/male data
tb_female <- tb %>% filter(gender=="f")
tb_male <- tb %>% filter(gender=="m")
## create two plots using plot_line
fig_female <- plot_line(tb_female)
fig_male <- plot_line(tb_male)
## create figure list
fig_list <- list(female = fig_female, male = fig_male)
## plot the two figures in a grid
grid_plot(fig_list, width=1000, height=600, same_axes=TRUE)
## split tb data by gender
# split() returns a named list of data frames, one per gender level.
tb_split_gender <- split(tb, tb$gender)
## create a list of figures using lapply
fig_list <- lapply(tb_split_gender, FUN=plot_line)
## create a grid plot
grid_plot(fig_list, width=1000, height=600, same_axes=TRUE)
## define a function to create a bar plot with the number of tb cases over time
## Bar-plot helper: yearly tb case counts with dodged bars colored by gender.
## x: a data frame with `year`, `count`, and `gender` columns (an age-band
##    subset of `tb`). Returns an rbokeh figure with hover tooltips enabled.
plot_bar <- function(x) {
  fig <- figure()
  ly_bar(fig,
         x = year, y = count,
         data = x,
         color = gender,
         position = "dodge",
         hover = TRUE)
}
## split tb data by age
# One data frame per age code (1524, 2534, ..., 65).
tb_split_age <- split(tb, tb$age)
## apply the function to the groups in tb_split_age
# Fixed: the original line contained a duplicated assignment
# (`fig_list <- fig_list <- lapply(...)`); a single assignment is equivalent.
fig_list <- lapply(tb_split_age, plot_bar)
## create a grid plot
# Rotate the x-axis year labels so they stay readable in the small panels.
grid_plot(fig_list, width=600, height=900, nrow=3, same_axes=TRUE) %>%
theme_axis("x", major_label_orientation = 90)
## initialize a map for NY center
# ny_map <- gmap(lat=40.73306, lng=-73.97351, zoom=11, map_style=gmap_style("blue_water"))
# ny_map
## filter ny_bikedata to get the entries for day "2017-04-25"
# ny_bikedata_20170425 <- ny_bikedata %>% filter(trip_date==as.Date("2017-04-25"))
## add a points layer to ny_map
# ny_map %>%
# ly_points(y=station_latitude, x=station_longitude,
# size=8, fill_color=start_count, line_alpha=0,
# data=ny_bikedata_20170425, hover=c(station_name, start_count, end_count)
# )
## create a names list with the two figures
# fig_list <- list(map_weekend=map_weekend_20170423, map_weekday=map_weekday_20170425)
## create a grid plot with the 2 maps
# grid_plot(fig_list, width=860, height=420)
Chapter 1 - Mini Case Study in A/B Testing
Introduction:
Baseline conversion rates:
Plot the conversion rate by month, e.g. `ggplot(..., aes(x = month(visit_date), y = conversion_rate)) + geom_point() + geom_line()`. Experimental design and power analysis:
Example code includes:
# Chapter 1 example: load the daily click data, look for weekday/weekly
# seasonality in the conversion rate, and size a hypothetical experiment.
# Read in data
click_data <- readr::read_csv("./RInputFiles/click_data.csv")
## Parsed with column specification:
## cols(
## visit_date = col_date(format = ""),
## clicked_adopt_today = col_integer()
## )
click_data
## # A tibble: 3,650 x 2
## visit_date clicked_adopt_today
## <date> <int>
## 1 2017-01-01 1
## 2 2017-01-02 1
## 3 2017-01-03 0
## 4 2017-01-04 1
## 5 2017-01-05 1
## 6 2017-01-06 0
## 7 2017-01-07 0
## 8 2017-01-08 0
## 9 2017-01-09 0
## 10 2017-01-10 0
## # ... with 3,640 more rows
# Find oldest and most recent date
min(click_data$visit_date)
## [1] "2017-01-01"
max(click_data$visit_date)
## [1] "2017-12-31"
# Calculate the mean conversion rate by day of the week
# NOTE(review): weekdays() returns locale-dependent day names, so the group
# labels (and their alphabetical order below) depend on the session locale.
click_data %>%
group_by(weekdays(visit_date)) %>%
summarize(conversion_rate = mean(clicked_adopt_today))
## # A tibble: 7 x 2
## `weekdays(visit_date)` conversion_rate
## <chr> <dbl>
## 1 Friday 0.267
## 2 Monday 0.277
## 3 Saturday 0.256
## 4 Sunday 0.3
## 5 Thursday 0.271
## 6 Tuesday 0.271
## 7 Wednesday 0.298
# Calculate the mean conversion rate by week of the year
click_data %>%
group_by(lubridate::week(visit_date)) %>%
summarize(conversion_rate = mean(clicked_adopt_today))
## # A tibble: 53 x 2
## `lubridate::week(visit_date)` conversion_rate
## <dbl> <dbl>
## 1 1 0.229
## 2 2 0.243
## 3 3 0.171
## 4 4 0.129
## 5 5 0.157
## 6 6 0.186
## 7 7 0.257
## 8 8 0.171
## 9 9 0.186
## 10 10 0.2
## # ... with 43 more rows
# Compute conversion rate by week of the year
click_data_sum <- click_data %>%
mutate(weekOfYear = lubridate::week(visit_date)) %>%
group_by(weekOfYear) %>%
summarize(conversion_rate = mean(clicked_adopt_today))
# Build plot
ggplot(click_data_sum, aes(x = `weekOfYear`, y = conversion_rate)) +
geom_point() +
geom_line() +
scale_y_continuous(limits = c(0, 1), labels = scales::percent)
# Compute and look at sample size for experiment in August
# p1/p2 are the baseline and target conversion rates; B = 0.5 assigns half of
# the sample to each condition. Result is presumably the TOTAL sample size
# across both groups -- confirm against the powerMediation docs.
total_sample_size <- powerMediation::SSizeLogisticBin(p1 = 0.54, p2 = 0.64,
B = 0.5, alpha = 0.05, power = 0.8
)
total_sample_size
## [1] 758
# Compute and look at sample size for experiment in August with 5% increase
# A smaller effect (0.54 -> 0.59) requires a much larger sample.
total_sample_size <- powerMediation::SSizeLogisticBin(p1 = 0.54, p2 = 0.59,
B = 0.5, alpha = 0.05, power = 0.8
)
total_sample_size
## [1] 3085
Chapter 2 - Mini Case Study in A/B Testing - Part II
Analyzing Results:
Designing follow-up experiments:
Pre-follow-up-experiment assumptions:
Follow-up experiment assumptions:
Example code includes:
# Chapter 2 example: load the A/B test results and an eight-month follow-up
# data set, visualize conversion over time, and analyze with logistic
# regression.
experiment_data <- read_csv("./RInputFiles/experiment_data.csv")
## Parsed with column specification:
## cols(
## visit_date = col_date(format = ""),
## condition = col_character(),
## clicked_adopt_today = col_integer()
## )
experiment_data
## # A tibble: 588 x 3
## visit_date condition clicked_adopt_today
## <date> <chr> <int>
## 1 2018-01-01 control 0
## 2 2018-01-01 control 1
## 3 2018-01-01 control 0
## 4 2018-01-01 control 0
## 5 2018-01-01 test 0
## 6 2018-01-01 test 0
## 7 2018-01-01 test 1
## 8 2018-01-01 test 0
## 9 2018-01-01 test 0
## 10 2018-01-01 test 1
## # ... with 578 more rows
followup_experiment_data <- read_csv("./RInputFiles/eight_month_checkin_data.csv")
## Parsed with column specification:
## cols(
## visit_date = col_date(format = ""),
## condition = col_character(),
## clicked_adopt_today = col_integer()
## )
followup_experiment_data
## # A tibble: 4,860 x 3
## visit_date condition clicked_adopt_today
## <date> <chr> <int>
## 1 2018-01-01 cat_hat 1
## 2 2018-01-01 cat_hat 1
## 3 2018-01-01 cat_hat 0
## 4 2018-01-01 cat_hat 0
## 5 2018-01-01 cat_hat 0
## 6 2018-01-01 cat_hat 0
## 7 2018-01-01 cat_hat 0
## 8 2018-01-01 cat_hat 0
## 9 2018-01-01 cat_hat 1
## 10 2018-01-01 no_hat 0
## # ... with 4,850 more rows
# Group and summarize data
experiment_data_clean_sum <- experiment_data %>%
group_by(condition, visit_date) %>%
summarize(conversion_rate = mean(clicked_adopt_today))
# Make plot of conversion rates over time
ggplot(experiment_data_clean_sum, aes(x = visit_date, y = conversion_rate,
color = condition, group = condition
)
) +
geom_point() +
geom_line()
# View summary of results
experiment_data %>%
group_by(condition) %>%
summarize(conversion_rate = mean(clicked_adopt_today))
## # A tibble: 2 x 2
## condition conversion_rate
## <chr> <dbl>
## 1 control 0.167
## 2 test 0.384
# Run logistic regression
# `condition` is character, so glm() treats "control" as the reference level;
# the conditiontest coefficient is the log-odds difference for the test arm.
experiment_results <- glm(clicked_adopt_today ~ condition, family = "binomial",
data = experiment_data
) %>%
broom::tidy()
experiment_results
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -1.61 0.156 -10.3 8.28e-25
## 2 conditiontest 1.14 0.197 5.77 7.73e- 9
# Run logistic regression power analysis
total_sample_size <- powerMediation::SSizeLogisticBin(p1 = 0.39, p2 = 0.59, B = 0.5,
alpha = 0.05, power = 0.8
)
total_sample_size
## [1] 194
# Follow-up experiment (cat_hat vs no_hat conditions): summarize, model, and
# inspect month-over-month conversion.
# View conversion rates by condition
followup_experiment_data %>%
group_by(condition) %>%
summarize(conversion_rate = mean(clicked_adopt_today))
## # A tibble: 2 x 2
## condition conversion_rate
## <chr> <dbl>
## 1 cat_hat 0.459
## 2 no_hat 0.271
# Run logistic regression
# "cat_hat" is the (alphabetical) reference level here, so the conditionno_hat
# coefficient is negative: no_hat converts worse than cat_hat.
followup_experiment_results <- glm(clicked_adopt_today ~ condition, family = "binomial",
data = followup_experiment_data
) %>%
broom::tidy()
followup_experiment_results
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -0.163 0.0407 -4.01 6.02e- 5
## 2 conditionno_hat -0.825 0.0611 -13.5 1.66e-41
# Compute monthly summary
eight_month_checkin_data_sum <- followup_experiment_data %>%
mutate(month_text = lubridate::month(visit_date, label = TRUE)) %>%
group_by(month_text, condition) %>%
summarize(conversion_rate = mean(clicked_adopt_today))
# Plot month-over-month results
ggplot(eight_month_checkin_data_sum, aes(x = month_text, y = conversion_rate,
color = condition, group = condition
)
) +
geom_point() +
geom_line()
# Plot monthly summary
ggplot(eight_month_checkin_data_sum, aes(x = month_text, y = conversion_rate,
color = condition, group = condition
)
) +
geom_point() +
geom_line() +
scale_y_continuous(limits = c(0, 1), labels = scales::percent) +
labs(x = "Month", y = "Conversion Rate")
# Compute difference over time
# The commented-out code below relies on `no_hat_data_sum`, a year-by-year
# summary built in the course environment but not in this document.
# no_hat_data_diff <- no_hat_data_sum %>%
# spread(year, conversion_rate) %>%
# mutate(year_diff = `2018` - `2017`)
# no_hat_data_diff
# Compute summary statistics
# mean(no_hat_data_diff$year_diff, na.rm = TRUE)
# sd(no_hat_data_diff$year_diff, na.rm = TRUE)
# Run power analysis for logistic regression
total_sample_size <- powerMediation::SSizeLogisticBin(p1 = 0.49, p2 = 0.64, B = 0.5,
alpha = 0.05, power = 0.8
)
total_sample_size
## [1] 341
# View summary of data
# `followup_experiment_data_sep` (the September re-run) also exists only in
# the course environment, hence the commented-out analysis.
# followup_experiment_data_sep %>%
# group_by(condition) %>%
# summarize(conversion_rate=mean(clicked_adopt_today))
# Run logistic regression
# followup_experiment_sep_results <- glm(clicked_adopt_today ~ condition,
# family = "binomial",
# data = followup_experiment_data_sep
# ) %>%
# broom::tidy()
# followup_experiment_sep_results
Chapter 3 - Experimental Design in A/B Testing
A/B Testing Research Questions:
Assumptions and types of A/B testing:
Confounding variables:
Side effects:
Example code includes:
# Chapter 3 examples. The `viz_website_*` data frames are supplied by the
# DataCamp course environment and are not loaded in this document.
# Fixed: `month()`, `week()`, and `percent` were called unqualified, but this
# document only attaches the tidyverse (lubridate and scales are NOT attached
# -- see the header), so those calls would fail here. They are now namespace-
# qualified, matching the style already used in the earlier A/B chapters
# (e.g. `lubridate::week()`, `scales::percent`).
# Compute summary by month
viz_website_2017 %>%
  group_by(lubridate::month(visit_date)) %>%
  summarize(article_conversion_rate = mean(clicked_article))
# Compute 'like' click summary by month
viz_website_2017_like_sum <- viz_website_2017 %>%
  mutate(month = lubridate::month(visit_date, label = TRUE)) %>%
  group_by(month) %>%
  summarize(like_conversion_rate = mean(clicked_like))
# Plot 'like' click summary by month
ggplot(viz_website_2017_like_sum,
       aes(x = month, y = like_conversion_rate, group = 1)
) +
  geom_point() +
  geom_line() +
  scale_y_continuous(limits = c(0, 1), labels = scales::percent)
# Plot comparison of 'like'ing and 'sharing'ing an article
ggplot(viz_website_2017_like_share_sum,
       aes(x = month, y = conversion_rate, color = action, group = action)
) +
  geom_point() +
  geom_line() +
  scale_y_continuous(limits = c(0, 1), labels = scales::percent)
# Compute conversion rates for A/A experiment
viz_website_2018_01_sum <- viz_website_2018_01 %>%
  group_by(condition) %>%
  summarize(like_conversion_rate = mean(clicked_like))
viz_website_2018_01_sum
# Plot conversion rates for two conditions
ggplot(viz_website_2018_01_sum, aes(x = condition, y = like_conversion_rate)) +
  geom_bar(stat = "identity") +
  scale_y_continuous(limits = c(0, 1), labels = scales::percent)
# Run logistic regression
aa_experiment_results <- glm(clicked_like ~ condition, family = "binomial", data = viz_website_2018_01) %>%
  broom::tidy()
aa_experiment_results
# Compute 'like' conversion rate by week and condition
viz_website_2018_02 %>%
  mutate(week = lubridate::week(visit_date)) %>%
  group_by(week, condition) %>%
  summarize(like_conversion_rate = mean(clicked_like))
# Compute 'like' conversion rate by if article published and condition
viz_website_2018_02 %>%
  group_by(article_published, condition) %>%
  summarize(like_conversion_rate = mean(clicked_like))
# Plot 'like' conversion rates by date for experiment
# interaction() gives one line per (condition, article_published) pair;
# the vline marks the experiment midpoint.
ggplot(viz_website_2018_02_sum,
       aes(x = visit_date, y = like_conversion_rate, color = condition,
           linetype = article_published,
           group = interaction(condition, article_published)
       )
) +
  geom_point() +
  geom_line() +
  geom_vline(xintercept = as.numeric(as.Date("2018-02-15"))) +
  scale_y_continuous(limits = c(0, 0.3), labels = scales::percent)
# Compute 'like' conversion rate and mean pageload time by day
viz_website_2018_03_sum <- viz_website_2018_03 %>%
  group_by(visit_date, condition) %>%
  summarize(mean_pageload_time = mean(pageload_time), like_conversion_rate = mean(clicked_like))
# Plot effect of 'like' conversion rate by pageload time
ggplot(viz_website_2018_03_sum, aes(x = mean_pageload_time, y = like_conversion_rate, color = condition)) +
  geom_point()
# Plot 'like' conversion rate by day
ggplot(viz_website_2018_03_sum, aes(x = visit_date, y = like_conversion_rate, color = condition,
                                    linetype = pageload_delay_added,
                                    group = interaction(condition, pageload_delay_added)
)
) +
  geom_point() +
  geom_line() +
  geom_vline(xintercept = as.numeric(as.Date("2018-03-15"))) +
  scale_y_continuous(limits = c(0, 0.3), labels = scales::percent)
Chapter 4 - Statistical Analyses in A/B Testing
Power analyses:
Statistical tests:
Stopping rules and sequential analysis:
Multivariate testing:
A/B Testing Recap:
Example code includes:
# Chapter 4 examples: power analyses, statistical tests, sequential analysis,
# and multivariate testing. `viz_website_2018_04`/`_05` come from the DataCamp
# course environment and are not loaded in this document.
# Run power analysis for logistic regression
total_sample_size <- powerMediation::SSizeLogisticBin(p1 = 0.17, p2 = 0.27,
B = 0.5, alpha = 0.05, power = 0.8
)
total_sample_size
# Run power analysis for t-test
sample_size <- pwr::pwr.t.test(d = 0.3, sig.level = 0.05, power = 0.8)
sample_size
# Run logistic regression
ab_experiment_results <- glm(clicked_like ~ condition, family = "binomial", data = viz_website_2018_04) %>%
broom::tidy()
ab_experiment_results
# Run t-test
ab_experiment_results <- t.test(time_spent_homepage_sec ~ condition, data = viz_website_2018_04)
ab_experiment_results
# Run sequential analysis
# k = 3 interim looks, one-sided design (test.type = 1), Pocock spending.
seq_analysis_3looks <- gsDesign::gsDesign(k = 3, test.type = 1,
alpha = 0.05, beta = 0.2, sfu = "Pocock"
)
seq_analysis_3looks
# Fill in max number of points and compute points per group and find stopping points
max_n <- 3000
max_n_per_group <- max_n / 2
stopping_points <- max_n_per_group * seq_analysis_3looks$timing
stopping_points
# Compute summary values for four conditions
viz_website_2018_05_sum <- viz_website_2018_05 %>%
group_by(word_one, word_two) %>%
summarize(mean_time_spent_homepage_sec = mean(time_spent_homepage_sec))
# Plot summary values for four conditions
ggplot(viz_website_2018_05_sum, aes(x = word_one, y = mean_time_spent_homepage_sec, fill = word_two)) +
geom_bar(stat = "identity", position = "dodge")
# Compute summary values for four conditions
viz_website_2018_05_sum <- viz_website_2018_05 %>%
group_by(word_one, word_two) %>%
summarize(like_conversion_rate = mean(clicked_like))
# Plot summary values for four conditions
# Fixed: `percent` was unqualified, but scales is not attached in this
# document (only the tidyverse is -- see header); qualified to match the
# `scales::percent` usage elsewhere in this document.
ggplot(viz_website_2018_05_sum, aes(x = word_one, y = like_conversion_rate, fill = word_two)) +
geom_bar(stat = "identity", position = "dodge") +
scale_y_continuous(limits = c(0, 1), labels = scales::percent)
# Organize variables and run logistic regression
# Explicit factor levels make "tips" and "better" the reference levels so the
# interaction coefficients are interpretable.
viz_website_2018_05_like_results <- viz_website_2018_05 %>%
mutate(word_one = factor(word_one, levels = c("tips", "tools"))) %>%
mutate(word_two = factor(word_two, levels = c("better", "amazing"))) %>%
glm(clicked_like ~ word_one * word_two, family = "binomial", data = .) %>%
broom::tidy()
viz_website_2018_05_like_results
Chapter 1 - Introduction to Mixture Models
Introduction to Model-Based Clustering:
Gaussian Distribution:
Gaussian Mixture Models (GMM):
Example code includes:
# Chapter 1 (mixture models) example: load the gender data set, then simulate
# and fit a single Gaussian as a warm-up for mixture modeling.
gender <- readr::read_csv("./RInputFiles/gender.csv")
## Parsed with column specification:
## cols(
## Gender = col_character(),
## Height = col_double(),
## Weight = col_double(),
## BMI = col_double(),
## probability = col_double()
## )
glimpse(gender)
## Observations: 10,000
## Variables: 5
## $ Gender <chr> "Male", "Male", "Male", "Male", "Male", "Male", "M...
## $ Height <dbl> 73.84702, 68.78190, 74.11011, 71.73098, 69.88180, ...
## $ Weight <dbl> 241.8936, 162.3105, 212.7409, 220.0425, 206.3498, ...
## $ BMI <dbl> 0.04435662, 0.03430822, 0.03873433, 0.04276545, 0....
## $ probability <dbl> 5.778312e-06, 6.059525e-01, 2.625952e-05, 3.628734...
# Have a look to gender (before clustering)
head(gender)
## # A tibble: 6 x 5
## Gender Height Weight BMI probability
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Male 73.8 242. 0.0444 0.00000578
## 2 Male 68.8 162. 0.0343 0.606
## 3 Male 74.1 213. 0.0387 0.0000263
## 4 Male 71.7 220. 0.0428 0.000363
## 5 Male 69.9 206. 0.0423 0.00461
## 6 Male 67.3 152. 0.0337 0.911
# Scatterplot with probabilities
# `probability` is presumably a pre-computed cluster membership probability
# shipped with the course data -- confirm.
gender %>%
ggplot(aes(x = Weight, y = BMI, col = probability))+
geom_point(alpha = 0.5)
# Set seed
set.seed(1313)
# Simulate a Gaussian distribution
simulation <- rnorm(n = 500, mean = 5, sd = 4)
# Check first six values
head(simulation)
## [1] 2.618374 8.719739 10.469360 11.462134 6.165605 7.497809
# Estimation of the mean
mean_estimate <- mean(simulation)
mean_estimate
## [1] 5.324427
# Estimation of the standard deviation
standard_deviation_estimate <- sd(simulation)
standard_deviation_estimate
## [1] 3.769612
# Transform the results to a data frame
simulation <- data.frame(x = simulation)
# Plot the sample with the estimated curve
# Overlay the fitted normal density on the (density-scaled) histogram.
ggplot(simulation) +
geom_histogram(aes(x = x, y = ..density..)) +
stat_function(geom = "line", fun = dnorm,
args = list(mean = mean_estimate,
sd = standard_deviation_estimate))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Fit a single Gaussian to Weight (which visibly needs a mixture), then
# simulate two- and three-component Gaussian mixtures by hand.
# Estimation of the mean
mean_estimate <- gender %>%
pull(Weight) %>%
mean()
mean_estimate
## [1] 161.4404
# Estimation of the standard deviation
sd_estimate <- gender %>%
pull(Weight) %>%
sd()
sd_estimate
## [1] 32.10844
# Plot the sample with the estimated curve
gender %>%
ggplot() +
geom_histogram(aes(x = Weight, y = ..density..), bins = 100) +
stat_function(geom = "line", fun = dnorm,
args = list(mean = mean_estimate, sd = sd_estimate))
# Create coin object
# Bernoulli draws: ~80% ones, ~20% zeros.
coin <- sample(c(0, 1), size = 500, replace = TRUE, prob = c(0.2, 0.8))
# Sample from two different Gaussian distributions
# ifelse() draws BOTH length-500 vectors and picks elementwise; each element
# comes from N(5, 2) when the coin is 1, otherwise from N(0, 1).
mixture <- ifelse(coin == 1, rnorm(n = 500, mean = 5, sd = 2), rnorm(n = 500))
# Check the first elements
head(mixture)
## [1] 6.715330 5.157209 2.158443 5.133819 6.089409 1.007223
# Transform into a data frame
mixture <- data.frame(x = mixture)
# Create histogram, specifying that it is a density plot
mixture %>% ggplot() +
geom_histogram(aes(x = x, y = ..density..), bins = 50)
number_observations <- 1000
# Create the assignment object
# Component labels 0/1/2 with mixing proportions 0.3/0.4/0.3.
assignments <- sample(c(0, 1 , 2), size = number_observations, replace = TRUE, prob = c(0.3, 0.4, 0.3))
# Simulate the GMM with 3 distributions
# Component 1 ~ N(5, 2), component 2 ~ N(10, 1), component 0 ~ N(0, 1).
mixture <- data.frame(
x = ifelse(assignments == 1, rnorm(n = number_observations, mean = 5, sd = 2),
ifelse(assignments == 2,
rnorm(n = number_observations, mean = 10, sd = 1),
rnorm(n = number_observations)
)
)
)
# Plot the mixture
mixture %>%
ggplot() +
geom_histogram(aes(x = x, y = ..density..), bins = 50)
Chapter 2 - Structure of Mixture Models and Parameter Estimation
Structure of Mixture Models:
Parameter Estimation:
EM Algorithm:
E-step example (posterior membership probabilities for a two-component mixture): `mutate(prob_from_red = proportions[1] * dnorm(x, mean = means[1]), prob_from_blue = proportions[2] * dnorm(x, mean = means[2]), prob_red = prob_from_red / (prob_from_red + prob_from_blue), prob_blue = prob_from_blue / (prob_from_red + prob_from_blue)) %>% select(x, prob_red, prob_blue)`. Example code includes:
# Load the handwritten-digit data: 1593 rows x 266 columns, where the first
# 256 columns are 16x16 pixel values and the last 10 columns are a one-hot
# encoding of the digit label.
digits <- readr::read_csv("./RInputFiles/digits.csv")
## Parsed with column specification:
## cols(
## .default = col_integer()
## )
## See spec(...) for full column specifications.
dim(digits)
## [1] 1593 266
digitData <- digits[, 1:256]
digitKey <- digits[, 257:266]
# keep a subset of 4 and 8
# Key columns 5 and 9 are the one-hot indicators for digits 4 and 8
# (presumably columns 1-10 encode digits 0-9 -- confirm).
# Fixed: rowSums() returns a NUMERIC 0/1 vector, and numeric row indexing
# treats it as positions (0 is dropped, every 1 selects row 1), so the
# original repeatedly selected row 1 instead of the matching rows. Comparing
# against 0 yields a logical mask, which subsets the intended rows.
digitUse <- rowSums(digitKey[, c(5, 9)]==1) > 0
digData <- digitData[digitUse, ]
digKey <- digitKey[digitUse, ]
## Render a length-256 pixel vector as a 16x16 grayscale digit image.
## arr256: the pixel values (may be a data-frame row; coerced to numeric).
## col:    the color palette for image().
## ...:    forwarded to graphics::image().
show_digit <- function(arr256, col=gray(4:1/4), ...) {
  pixels <- as.numeric(arr256)
  img <- matrix(pixels, nrow = 16)
  # Reverse the columns so the digit is drawn right side up.
  image(img[, 16:1], col = col, ...)
}
# Quick structural look at the 256 pixel columns (output follows below).
# Dimension
# broom::glance(digits)
# Apply `glimpse` to the data
glimpse(digitData)
## Observations: 1,593
## Variables: 256
## $ V1 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V2 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V3 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
## $ V4 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, ...
## $ V5 <int> 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, ...
## $ V6 <int> 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, ...
## $ V7 <int> 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...
## $ V8 <int> 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...
## $ V9 <int> 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...
## $ V10 <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, ...
## $ V11 <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, ...
## $ V12 <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, ...
## $ V13 <int> 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, ...
## $ V14 <int> 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, ...
## $ V15 <int> 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, ...
## $ V16 <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ V17 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V18 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
## $ V19 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, ...
## $ V20 <int> 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, ...
## $ V21 <int> 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, ...
## $ V22 <int> 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, ...
## $ V23 <int> 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, ...
## $ V24 <int> 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, ...
## $ V25 <int> 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, ...
## $ V26 <int> 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, ...
## $ V27 <int> 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, ...
## $ V28 <int> 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, ...
## $ V29 <int> 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, ...
## $ V30 <int> 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, ...
## $ V31 <int> 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, ...
## $ V32 <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ V33 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
## $ V34 <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, ...
## $ V35 <int> 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, ...
## $ V36 <int> 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, ...
## $ V37 <int> 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, ...
## $ V38 <int> 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, ...
## $ V39 <int> 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, ...
## $ V40 <int> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, ...
## $ V41 <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ...
## $ V42 <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, ...
## $ V43 <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, ...
## $ V44 <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, ...
## $ V45 <int> 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, ...
## $ V46 <int> 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, ...
## $ V47 <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, ...
## $ V48 <int> 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, ...
## $ V49 <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, ...
## $ V50 <int> 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, ...
## $ V51 <int> 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, ...
## $ V52 <int> 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, ...
## $ V53 <int> 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, ...
## $ V54 <int> 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, ...
## $ V55 <int> 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, ...
## $ V56 <int> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, ...
## $ V57 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ V58 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V59 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V60 <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, ...
## $ V61 <int> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, ...
## $ V62 <int> 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, ...
## $ V63 <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, ...
## $ V64 <int> 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, ...
## $ V65 <int> 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, ...
## $ V66 <int> 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, ...
## $ V67 <int> 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, ...
## $ V68 <int> 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, ...
## $ V69 <int> 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ...
## $ V70 <int> 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, ...
## $ V71 <int> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, ...
## $ V72 <int> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ V73 <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V74 <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V75 <int> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V76 <int> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V77 <int> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, ...
## $ V78 <int> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, ...
## $ V79 <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, ...
## $ V80 <int> 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, ...
## $ V81 <int> 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, ...
## $ V82 <int> 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, ...
## $ V83 <int> 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, ...
## $ V84 <int> 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, ...
## $ V85 <int> 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ V86 <int> 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, ...
## $ V87 <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ V88 <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V89 <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V90 <int> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V91 <int> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V92 <int> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V93 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V94 <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, ...
## $ V95 <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ V96 <int> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, ...
## $ V97 <int> 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, ...
## $ V98 <int> 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...
## $ V99 <int> 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, ...
## $ V100 <int> 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ V101 <int> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ V102 <int> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, ...
## $ V103 <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ V104 <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V105 <int> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V106 <int> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V107 <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V108 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V109 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V110 <int> 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ...
## $ V111 <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ V112 <int> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, ...
## $ V113 <int> 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ V114 <int> 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...
## $ V115 <int> 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, ...
## $ V116 <int> 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ V117 <int> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ V118 <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...
## $ V119 <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...
## $ V120 <int> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V121 <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V122 <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V123 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V124 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V125 <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ V126 <int> 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, ...
## $ V127 <int> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ V128 <int> 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...
## $ V129 <int> 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ V130 <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...
## $ V131 <int> 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, ...
## $ V132 <int> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ V133 <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ V134 <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V135 <int> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V136 <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V137 <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V138 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V139 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V140 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V141 <int> 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ V142 <int> 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, ...
## $ V143 <int> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ V144 <int> 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, ...
## $ V145 <int> 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...
## $ V146 <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...
## $ V147 <int> 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, ...
## $ V148 <int> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ V149 <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ V150 <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V151 <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V152 <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V153 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V154 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V155 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V156 <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ V157 <int> 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, ...
## $ V158 <int> 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, ...
## $ V159 <int> 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...
## $ V160 <int> 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, ...
## $ V161 <int> 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...
## $ V162 <int> 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, ...
## $ V163 <int> 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, ...
## $ V164 <int> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...
## $ V165 <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...
## $ V166 <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V167 <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V168 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V169 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V170 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V171 <int> 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ V172 <int> 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, ...
## $ V173 <int> 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, ...
## $ V174 <int> 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, ...
## $ V175 <int> 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, ...
## $ V176 <int> 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, ...
## $ V177 <int> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, ...
## $ V178 <int> 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, ...
## $ V179 <int> 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, ...
## $ V180 <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, ...
## $ V181 <int> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ V182 <int> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V183 <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V184 <int> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V185 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V186 <int> 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ V187 <int> 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, ...
## $ V188 <int> 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, ...
## $ V189 <int> 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, ...
## $ V190 <int> 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, ...
## $ V191 <int> 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, ...
## $ V192 <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
## $ V193 <int> 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, ...
## $ V194 <int> 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, ...
## $ V195 <int> 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, ...
## $ V196 <int> 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, ...
## $ V197 <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, ...
## $ V198 <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V199 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V200 <int> 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V201 <int> 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ V202 <int> 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, ...
## $ V203 <int> 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, ...
## $ V204 <int> 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, ...
## $ V205 <int> 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, ...
## $ V206 <int> 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, ...
## $ V207 <int> 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, ...
## $ V208 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V209 <int> 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, ...
## $ V210 <int> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, ...
## $ V211 <int> 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, ...
## $ V212 <int> 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, ...
## $ V213 <int> 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, ...
## $ V214 <int> 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, ...
## $ V215 <int> 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V216 <int> 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, ...
## $ V217 <int> 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, ...
## $ V218 <int> 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, ...
## $ V219 <int> 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, ...
## $ V220 <int> 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, ...
## $ V221 <int> 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, ...
## $ V222 <int> 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, ...
## $ V223 <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
## $ V224 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V225 <int> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ V226 <int> 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, ...
## $ V227 <int> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, ...
## $ V228 <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, ...
## $ V229 <int> 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ V230 <int> 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, ...
## $ V231 <int> 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ V232 <int> 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ V233 <int> 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ V234 <int> 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...
## $ V235 <int> 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, ...
## $ V236 <int> 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, ...
## $ V237 <int> 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, ...
## $ V238 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V239 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V240 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V241 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V242 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V243 <int> 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, ...
## $ V244 <int> 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...
## $ V245 <int> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, ...
## $ V246 <int> 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...
## $ V247 <int> 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ V248 <int> 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ V249 <int> 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, ...
## $ V250 <int> 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, ...
## $ V251 <int> 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, ...
## $ V252 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, ...
## $ V253 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V254 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V255 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ V256 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
# Render two of the hand-written digit images for visual inspection.
# show_digit() and digitData are defined earlier in this document;
# each row of digitData holds one flattened pixel image.
# Digit in row 50
show_digit(digitData[50, ])
# Digit in row 100
show_digit(digitData[100, ])
# Simulated univariate sample (n = 500) drawn from a two-component Gaussian
# mixture, bundled with pre-computed posterior membership probabilities:
# prob_cluster1 is the probability that each observation was generated by
# the first mixture component.
gaussian_sample_with_probs <- data.frame(
x = c(54.5, 7.7, 55.9, 27.9, 4.6, 59.9, 6.4, 60.5, 32.6, 21.3, 0.5, 8.9, 70.7, 49.3, 40.1, 43, 8.1, 62.9, 56, 54.4, 42.5, 46.1, 58.3, 61.7, -11.6, 10.8, 27.5, 12.2, 67.7, -5.6, 13.3, 62.7, 37.2, 41.4, 47.4, 54.2, 31, 60.2, 69.9, 33.8, 25.4, 21.9, 17.9, 61.5, 49.8, 37.9, 55.8, 14.1, 53.3, 45.6, 44.7, 14.2, -5.7, 10.9, 63.7, -6.5, 50.3, 61.4, 35.1, -3.7, 68.4, -6.2, 64, 24.4, 65.7, 59.7, 52.7, 27.2, 17.5, 22.6, 14.7, 22.1, 61.5, 55.6, 62.6, 5.6, 52.3, 8, 25.4, 48.8, 58.4, 6.2, 52.3, 6.6, 64, 43, 60.6, 33.5, 45.8, 2.5, 63, 58.2, 50.9, 22.1, 36.5, 27.1, 61.4, 56.3, 63.5, 55.6, 53.8, 31.9, 30.7, 15.6, 14.8, 44.4, 51.9, 61.4, 11.8, 51.3, 58.6, 45.4, 8.3, 41.5, 52.7, 9.1, 60.8, 40.2, 20.5, 40.2, 59.2, 36.7, 47.5, 12.2, 7.7, 56.2, -13.2, 6, 58.7, 43.7, 67.3, 53.6, 37.6, 54.3, 37.7, 51.9, 10.5, 42, 24, -0.7, 53.1, 27.4, 57.2, 37.3, 28.6, 13.5, 35.2, 22.7, 35.8, 66.9, 45.9, 45.9, 56.7, 55.6, 58.3, 3.2, 45.9, 59.5, 50.8, 43.7, 42.8, 4.7, 29.5, 50.9, 7.8, 44.3, 53.6, 57, 57.8, 47.3, 56.8, 51.1, 27.7, 44.9, 33, 44, 42.1, 38, 52.3, 44, 28.1, 52.7, 53.6, 4.7, 42.1, 40.8, 5, 8, 49.1, 67.5, 16.2, 11.2, 14.6, 32.8, 61.3, 49.8, 51.5, 54.5, 51.6, 45.8, 55.9, 7.4, -10.2, 41.9, 27.4, 45.1, 17.7, 37.5, 53.5, 25.7, 18.1, 13.4, 40.5, 13.3, 2, 49.8, 66.7, 34.7, 11.4, 42.1, 54.4, 48.3, 38.3, 17.4, 48.2, 48.4, 57.4, 54.5, 13.6, 52.3, -0.1, 12.8, 29.3, 45.6, 62.3, 49.2, 32.6, 38.4, 15, 6.1, 12.2, 5.8, 17.7, 20.7, 43.6, 52.3, 42.4, 64.6, 34.3, 9.5, 3.6, 37.2, 45.7, 56.9, 67, 48.7, -3.1, 50.1, 45.4, 54.4, 38.1, 10.8, 7.4, 50.5, 24.7, 11.4, 59.5, 43.9, 4.4, 53.7, 41.9, 60.2, 49.5, 11.6, 51.1, 69.1, 46.2, 35.5, 15, -6.4, 59.9, 57.3, 49.1, 55.5, 55.6, 43.9, 52.5, 46.4, 5.8, 55.3, 22.2, 42.7, 51.3, 40.1, 62.1, 62.2, 48.8, 6.1, 0.6, 19.6, 36.8, 48, 33.8, 52.8, 66.6, 30.2, 45.9, 5.9, 52.7, 49.7, 37.7, 10.4, 60.1, 35.8, 62.1, 35, 38.7, 13.3, -4.9, 30.6, 55.9, 23.7, 12.6, 45.7, 38.1, 9.9, 39.6, 46.3, -3.5, 31.2, 8.3, -8.1, 31.4, 65.7, 10.7, 5.5, 54.4, 51.8, 59.8, 50.3, 45.1, 8.5, 15.3, 3.2, 19.3, 40.8, 
48.4, 30.1, 32.7, 12.7, 59.2, 51.4, 55.3, 58.9, -19, 61.9, 30.3, 77.2, 39.8, 31.3, 23.1, 56, 41.9, 0.5, 33.4, 36.6, 54.4, 12.4, 16.4, 24.4, -2.4, 30.9, 56.4, 12.5, 65.2, 10, -1.7, 45.7, 49.5, 45.3, 17.5, 29, -8.7, 51.7, 17.3, 20.2, 14.6, 47.6, 55.3, 50.2, 4.1, 47.5, 71, 13.2, 75.4, 6.2, 53, 54.2, 40.6, 55.1, 67.4, 45, 47.3, 44.2, 8.4, 46.1, 48.7, 8.3, 40.4, 63, 49, 2.8, 50.4, 17.7, 40.4, 41.1, 56.6, 37.3, -0.1, 62.5, 47.7, 62.1, 16.6, 33.3, 4.1, 61, 49.4, 44.1, 18.7, -1.3, 42.1, -11.8, 40.6, 45.6, 14.9, 51.9, 57.4, 41.3, 59.2, 58.6, 50.5, -3.9, -0.6, 11.5, 54.5, 57.1, 46.2, 51.9, 58.2, 51.6, 50.3, 64.2, 8.3, 49, 42, 43.7, 53.4, 6.5, 36.6, -18.2, 41.8, -6.8, 35, 46.8, 43.8, 60.6, -11.3, 18.5, 0.3, 40.2, 73.3, 58.2, 43.9, 22.2, 12.8, 6.7, 36.3, 51.8, 33.6, 71, 56.8, 26, 43.3, 37.4, 60, 17.2, -10.3, 43.9, 69, 38.7, 57.9, 40.2, 48.6, 57.7, 45.8, 56.2, 7.3, 32.1, 41.2, 39.1),
prob_cluster1=c(0, 1, 0, 0.552, 1, 0, 1, 0, 0.158, 0.947, 1, 1, 0, 0, 0.01, 0.003, 1, 0, 0, 0, 0.004, 0.001, 0, 0, 1, 0.999, 0.591, 0.999, 0, 1, 0.998, 0, 0.03, 0.006, 0.001, 0, 0.268, 0, 0, 0.107, 0.773, 0.933, 0.985, 0, 0, 0.023, 0, 0.997, 0, 0.001, 0.002, 0.997, 1, 0.999, 0, 1, 0, 0, 0.065, 1, 0, 1, 0, 0.834, 0, 0, 0, 0.626, 0.988, 0.912, 0.996, 0.928, 0, 0, 0, 1, 0, 1, 0.773, 0, 0, 1, 0, 1, 0, 0.003, 0, 0.118, 0.001, 1, 0, 0, 0, 0.926, 0.038, 0.631, 0, 0, 0, 0, 0, 0.201, 0.286, 0.994, 0.996, 0.002, 0, 0, 0.999, 0, 0, 0.001, 1, 0.005, 0, 1, 0, 0.009, 0.961, 0.009, 0, 0.036, 0, 0.999, 1, 0, 1, 1, 0, 0.002, 0, 0, 0.025, 0, 0.024, 0, 0.999, 0.004, 0.855, 1, 0, 0.604, 0, 0.028, 0.484, 0.997, 0.062, 0.909, 0.05, 0, 0.001, 0.001, 0, 0, 0, 1, 0.001, 0, 0, 0.002, 0.003, 1, 0.398, 0, 1, 0.002, 0, 0, 0, 0.001, 0, 0, 0.576, 0.001, 0.138, 0.002, 0.004, 0.022, 0, 0.002, 0.533, 0, 0, 1, 0.004, 0.007, 1, 1, 0, 0, 0.993, 0.999, 0.996, 0.147, 0, 0, 0, 0, 0, 0.001, 0, 1, 1, 0.005, 0.606, 0.001, 0.987, 0.026, 0, 0.754, 0.985, 0.998, 0.008, 0.998, 1, 0, 0, 0.076, 0.999, 0.004, 0, 0, 0.019, 0.988, 0, 0, 0, 0, 0.997, 0, 1, 0.998, 0.417, 0.001, 0, 0, 0.162, 0.018, 0.995, 1, 0.998, 1, 0.987, 0.956, 0.002, 0, 0.004, 0, 0.087, 0.999, 1, 0.03, 0.001, 0, 0, 0, 1, 0, 0.001, 0, 0.021, 0.999, 1, 0, 0.82, 0.999, 0, 0.002, 1, 0, 0.005, 0, 0, 0.999, 0, 0, 0.001, 0.057, 0.995, 1, 0, 0, 0, 0, 0, 0.002, 0, 0.001, 1, 0, 0.925, 0.003, 0, 0.009, 0, 0, 0, 1, 1, 0.971, 0.035, 0, 0.104, 0, 0, 0.329, 0.001, 1, 0, 0, 0.024, 0.999, 0, 0.05, 0, 0.067, 0.016, 0.998, 1, 0.298, 0, 0.87, 0.998, 0.001, 0.021, 0.999, 0.012, 0.001, 1, 0.247, 1, 1, 0.233, 0, 0.999, 1, 0, 0, 0, 0, 0.001, 1, 0.995, 1, 0.975, 0.007, 0, 0.34, 0.157, 0.998, 0, 0, 0, 0, 1, 0, 0.325, 0, 0.011, 0.246, 0.895, 0, 0.005, 1, 0.12, 0.037, 0, 0.998, 0.992, 0.834, 1, 0.273, 0, 0.998, 0, 0.999, 1, 0.001, 0, 0.001, 0.987, 0.446, 1, 0, 0.989, 0.964, 0.996, 0, 0, 0, 1, 0, 0, 0.998, 0, 1, 0, 0, 0.008, 0, 0, 0.001, 0.001, 0.002, 1, 0.001, 0, 1, 0.008, 
0, 0, 1, 0, 0.986, 0.008, 0.006, 0, 0.028, 1, 0, 0, 0, 0.991, 0.128, 1, 0, 0, 0.002, 0.98, 1, 0.004, 1, 0.008, 0.001, 0.996, 0, 0, 0.006, 0, 0, 0, 1, 1, 0.999, 0, 0, 0.001, 0, 0, 0, 0, 0, 1, 0, 0.004, 0.002, 0, 1, 0.037, 1, 0.005, 1, 0.068, 0.001, 0.002, 0, 1, 0.981, 1, 0.009, 0, 0, 0.002, 0.925, 0.998, 1, 0.042, 0, 0.114, 0, 0, 0.725, 0.003, 0.027, 0, 0.989, 1, 0.002, 0, 0.016, 0, 0.009, 0, 0, 0.001, 0, 1, 0.189, 0.006, 0.014)
)
# The two posterior membership probabilities are complementary, so
# cluster 2's probability is simply one minus cluster 1's.
gaussian_sample_with_probs$prob_cluster2 <- 1 - gaussian_sample_with_probs$prob_cluster1
glimpse(gaussian_sample_with_probs)
## Observations: 500
## Variables: 3
## $ x <dbl> 54.5, 7.7, 55.9, 27.9, 4.6, 59.9, 6.4, 60.5, 32....
## $ prob_cluster1 <dbl> 0.000, 1.000, 0.000, 0.552, 1.000, 0.000, 1.000,...
## $ prob_cluster2 <dbl> 1.000, 0.000, 1.000, 0.448, 0.000, 1.000, 0.000,...
# M-step (means): each cluster mean is the probability-weighted average
# of x, using the posterior membership probabilities as weights.
means_estimates <- gaussian_sample_with_probs %>%
  summarise(
    mean_cluster1 = sum(x * prob_cluster1) / sum(prob_cluster1),
    mean_cluster2 = sum(x * prob_cluster2) / sum(prob_cluster2)
  )
means_estimates
## mean_cluster1 mean_cluster2
## 1 10.39535 49.46501
# M-step (proportions): each cluster's mixing proportion is the average
# posterior membership probability over all observations.
props_estimates <- gaussian_sample_with_probs %>%
  select(prob_cluster1, prob_cluster2) %>%
  summarise_all(mean) %>%
  rename(props_cluster1 = prob_cluster1, props_cluster2 = prob_cluster2)
props_estimates
## props_cluster1 props_cluster2
## 1 0.33148 0.66852
# Transform to a vector
# (as.numeric() flattens the one-row, two-column summary into a length-2
# numeric vector so it can be used as a plot annotation)
means_estimates <- as.numeric(means_estimates)
# Plot histogram with means estimates
# geom_vline() draws one vertical line per estimated cluster mean
ggplot(gaussian_sample_with_probs) + geom_histogram(aes(x = x), bins = 100) +
geom_vline(xintercept = means_estimates)
# Raw simulated sample (n = 500) from the same two-component Gaussian
# mixture, without membership probabilities; these are computed below.
gaussian_sample <- data.frame(
x=c(6.4, 5.9, 57.8, 52.6, 54.3, 52.3, 4.4, 49.1, -4, 12.7, 19.8, 51.8, 35.4, 17.1, 38.8, 44.1, 45.6, 7.9, 57.7, 51.1, 14.1, 36.6, 51.6, 4.1, -1.8, 55.1, 52.4, 54.4, 47.9, 36.6, 53.9, 15, 68.8, 8.3, 40.8, 39.3, 37.1, 12.7, 54.6, 34.1, 24.9, 58.5, 50.8, 48.6, 60, 52.1, 61.5, 6.9, 63, 63.5, 54.1, 37.7, 52.6, 49.1, 53.7, 13.4, 23.6, 45.5, 33.4, 46.4, 46.6, 56.1, 37.8, 44.1, 62.4, 12, 54.4, 31.6, -1, 9.4, 16, 53.4, 71.1, 8.9, 64.4, 55.9, 50.5, 57.2, 45.9, 18.5, 53.9, 12.5, 12.2, 1.5, 0.3, 40.1, 13.9, 53.2, 12.1, 57.2, 2.3, -2.6, 2.7, 59.6, 3, 10.3, 66.9, 57.3, 57.6, 9.1, 43.8, 51.1, 7.7, 13.4, 46.3, 57.5, 0.2, 1.9, 43.8, 53.9, 9.3, 45.5, 15.4, -3.2, -1.2, 40.5, 1.9, 14.5, -2, 3.4, 54.1, 2.9, 58.2, 49.5, 49.1, 60.2, 45.3, 59.7, 38, 22.4, 42.6, 53.6, 7.3, 43.9, 2.8, 66.5, 56.5, 44.4, 53.5, 40.6, 57.1, 43.8, -3.1, 47.3, 42.5, 50.8, -12, -12, 15.2, 43.8, 57.3, 32.2, 61.1, 15.1, 5.8, 24.7, 51.5, 7.7, -5.1, 63.1, 50.1, 39.9, 38.7, -5.2, 50.3, 49.1, 58.1, 31.3, 54.6, 39.1, 4.4, 60.5, 45.6, 59.7, 39.5, 60.6, 42.8, 49.5, 12.9, 47.2, 50, 11.4, 50.9, 57.3, 46.7, 35.6, 38.8, 56, -5.7, 50.5, 21.2, 45.9, 60.7, 22.1, 46.7, 12.5, 55.2, 48.4, 36.6, 54, 47, 50.3, 51.7, 11, 56, 42.4, 61.8, 45.6, 60.5, 40.6, 8.8, 21, 5.6, 68.2, 21.3, 11.5, 47.2, 26.4, 35.8, 25.4, 19.6, 56, 9.1, 63.4, 48.5, 3.2, 57.1, 52.7, 11.3, 16.3, 49, 46.5, 12.4, 9.6, 45.5, 55.3, 72.9, 8.1, -3.8, 53.8, 34.1, 45.7, 56.3, 44, 23.4, 57.2, 0.5, 33.2, 63.4, 37.3, 57.3, 52.7, 9.7, 51.9, 39.4, 63.7, 23.3, 39.9, -0.5, 41.6, 11.3, 48, 38.2, 54.2, 41.3, 30.6, 55.2, 48.9, 34.4, 16.2, 45.7, 10.1, 42.7, 12.2, 39.5, 14.1, 64.9, 53.1, 50.4, 47, 58.5, 50.8, 43.9, 56.8, 12.6, 44.5, 54.6, 8.9, 15.5, 50.2, 4.8, 52.8, 14.4, 33.7, 5.4, -0.2, 19.8, 51, 59.4, -8.2, 10.4, 47.8, 31.2, 41.4, 9.4, -3.2, 21.1, 44.7, 22.9, 11.5, 49.6, 26.7, 11.5, 35.2, 9.4, 44.8, 63.1, 8.5, 21, 30.9, 16.1, 54.4, 53.4, 9.7, 49.8, 45.6, -3, 53, 43.4, 43.4, 43.9, 56.6, 33.5, 55.1, 54.4, 62.8, 37.9, 35.1, 8.6, 7.1, 46.1, 6.1, 27, -9.9, 6.4, 44.6, 49, 46, 42.4, 9.5, 
47.1, 51.3, -4.7, 14, 64.8, 38, 33.6, -0.4, 53.5, 40.3, 47.2, 58.5, 45.4, 2.5, 52.9, 47.4, 56.1, 17.7, 3.9, 30.7, 44.6, 42.4, 55.4, 47.1, 11.5, 50.7, 47.6, 11.3, 45.1, 44.2, 46.6, 36.9, 47.4, 54.6, -2, 50.7, 63.6, 58.9, 7.6, -3.1, 31.1, 44.9, 55.7, 16.6, 64.3, 27.1, 23, 48.7, -0.8, 23.6, 72.8, 11.9, 57.3, 25.4, 47.1, 9.4, 57.6, 39.6, 25.3, 31.2, 52.4, 51.1, 1.6, 76.5, 50.7, 34.2, 7.6, 25.4, 11.7, 53.5, 17.5, 53.7, 61.2, 49.9, 48.8, 40.8, 61.2, 16.4, 48.6, 7.5, -2, 64.2, 26.2, 11.2, 3.2, -4.3, 37.9, 47.7, 26.3, 58, 66.9, 59.1, 35.8, 14.2, 53, 60.3, 63.3, 53.6, 47.6, 57.1, 37, 47.6, 61.6, 52.7, 0.8, 50.5, 48.1, -3.4, 53.6, 35.7, 49.8, 2.7, 59.9, 36.5, 63.6, 53.3, 3.8, 20.2, 19.7, 20.7, 45.6, 39.8, 37.2, 38.6, 12.4, 56.3, 59.6, 10.5, 11, -6.8, 58.8, 49.5, -3.6, 51.1, 53.1, 46, 57.9, 15.2, -2.3, 22.9, 32.8, 37.6, 52, 77.5, 2.2, 9.5, 40.4, 48.5, 27.2, 37.4)
)
str(gaussian_sample)
## 'data.frame': 500 obs. of 1 variable:
## $ x: num 6.4 5.9 57.8 52.6 54.3 52.3 4.4 49.1 -4 12.7 ...
# Create data frame with probabilities
# E-step by hand: posterior membership probabilities via Bayes' rule.
# prob_from_clusterK = (mixing proportion) * (component density at x);
# dividing each by their sum normalises them into posterior probabilities
# that sum to one for every observation.  The mixing proportions (0.35,
# 0.65), means (10, 50) and sds (10, 10) are the assumed true parameters.
gaussian_sample_with_probs <- gaussian_sample %>%
mutate(prob_from_cluster1 = 0.35 * dnorm(x, mean = 10, sd = 10),
prob_from_cluster2 = 0.65 * dnorm(x, mean = 50, sd = 10),
prob_cluster1 = prob_from_cluster1/(prob_from_cluster1 + prob_from_cluster2),
prob_cluster2 = prob_from_cluster2/(prob_from_cluster1 + prob_from_cluster2)) %>%
select(x, prob_cluster1, prob_cluster2)
head(gaussian_sample_with_probs)
## x prob_cluster1 prob_cluster2
## 1 6.4 9.998524e-01 0.0001475847
## 2 5.9 9.998792e-01 0.0001208354
## 3 57.8 7.976210e-06 0.9999920238
## 4 52.6 6.384176e-05 0.9999361582
## 5 54.3 3.234434e-05 0.9999676557
## 6 52.3 7.198080e-05 0.9999280192
expectation <- function(data, means, proportions, sds){
  # E-step of the EM algorithm: given the current parameter estimates,
  # compute each observation's posterior probability of belonging to each
  # of the two clusters (Bayes' rule on the weighted component densities).
  #
  # data        - data frame with a numeric column `x`
  # means       - length-2 vector of current cluster means
  # proportions - length-2 vector of current mixing proportions
  # sds         - length-2 vector of current cluster standard deviations
  #
  # Returns a data frame with columns x, prob_cluster1, prob_cluster2.
  data %>%
    mutate(
      prob_from_cluster1 = proportions[1] * dnorm(x, mean = means[1], sd = sds[1]),
      prob_from_cluster2 = proportions[2] * dnorm(x, mean = means[2], sd = sds[2]),
      prob_cluster1 = prob_from_cluster1 / (prob_from_cluster1 + prob_from_cluster2),
      prob_cluster2 = prob_from_cluster2 / (prob_from_cluster1 + prob_from_cluster2)
    ) %>%
    select(x, prob_cluster1, prob_cluster2)
}
maximization <- function(data_with_probs){
  # M-step of the EM algorithm: re-estimate the cluster means and mixing
  # proportions from the posterior membership probabilities.
  #
  # data_with_probs - data frame with columns x, prob_cluster1,
  #                   prob_cluster2 (as produced by expectation())
  #
  # Returns a named list:
  #   means       - length-2 vector of updated cluster means
  #               (probability-weighted averages of x)
  #   proportions - length-2 vector of updated mixing proportions
  #               (average membership probabilities, summing to one)
  # The elements remain reachable positionally ([[1]], [[2]]), so existing
  # callers that index by position are unaffected.
  means_estimates <- data_with_probs %>%
    summarise(mean_1 = sum(x * prob_cluster1) / sum(prob_cluster1),
              mean_2 = sum(x * prob_cluster2) / sum(prob_cluster2)
    ) %>%
    as.numeric()
  props_estimates <- data_with_probs %>%
    summarise(proportion_1 = mean(prob_cluster1), proportion_2 = 1 - proportion_1) %>%
    as.numeric()
  list(means = means_estimates, proportions = props_estimates)
}
# Starting guesses for the EM iteration
means_init <- c(0, 100)
props_init <- c(0.5, 0.5)
# Alternate E-steps and M-steps ten times, printing the running estimates
# after each cycle; the standard deviations are held fixed at 10.
for (iteration in seq_len(10)) {
  updated <- maximization(expectation(gaussian_sample, means_init, props_init, c(10, 10)))
  means_init <- updated[[1]]
  props_init <- updated[[2]]
  cat(iteration, means_init, props_init, "\n")
}
## 1 25.28863 56.90005 0.6797875 0.3202125
## 2 20.01129 53.44814 0.539439 0.460561
## 3 14.77156 51.48322 0.4377961 0.5622039
## 4 11.62146 50.28191 0.3846544 0.6153456
## 5 10.34764 49.72052 0.363436 0.636564
## 6 9.918957 49.49888 0.355935 0.644065
## 7 9.777305 49.41932 0.3533705 0.6466295
## 8 9.730017 49.39192 0.3525025 0.6474975
## 9 9.714139 49.38262 0.3522096 0.6477904
## 10 9.708796 49.37948 0.3521109 0.6478891
fun_gaussian <- function(x, mean, proportion, sd = 10){
  # Weighted Gaussian density for one mixture component:
  # proportion * N(x; mean, sd).
  #
  # x          - numeric vector of evaluation points
  # mean       - component mean
  # proportion - mixing proportion (weight) of the component
  # sd         - component standard deviation; previously hard-coded to 10,
  #              now a parameter with the same default so existing calls
  #              are unaffected
  proportion * dnorm(x, mean, sd = sd)
}
# Snapshot the estimates reached after the tenth EM iteration
means_iter10 <- means_init
props_iter10 <- props_init
# Overlay the two fitted (weighted) component densities on a
# density-scaled histogram of the sample; ..density.. puts the histogram
# on the same vertical scale as the density curves.
gaussian_sample %>% ggplot() +
geom_histogram(aes(x = x, y = ..density..), bins = 200) +
stat_function(geom = "line", fun = fun_gaussian,
args = list(mean = means_iter10[1], proportion = props_iter10[1])
) +
stat_function(geom = "line", fun = fun_gaussian,
args = list(mean = means_iter10[2], proportion = props_iter10[2])
)
Chapter 3 - Mixture of Gaussians with flexmix
Univariate Gaussian Mixture Models:
Univariate Gaussian Mixture Models with flexmix:
Bivariate Gaussian Mixture Models with flexmix:
Bivariate Gaussian Mixture Models with flexmix:
geom_path(data = data.frame(ellipse_comp_1), aes(x=x,y=y), col = "red") + geom_path(data = data.frame(ellipse_comp_2), aes(x=x,y=y), col = "blue")
Example code includes:
# Simulated one-dimensional sample (n = 1000) from a three-component
# Gaussian mixture, built up in two halves (xExample), together with the
# true component assignment of each observation (mix_assign, values 1-3).
xExample <- c(7.3, 58.7, 9.7, 16.9, 6.3, 35.1, 33.5, 61.3, 28.3, 24.3, 58.6, 13.1, 58.7, 34, 29.1, 46.4, 54.6, 5.9, 30.6, 27.9, 27.5, -5.3, 37.6, 9.1, 44.5, 57.5, 30.5, 5, 51.9, 33.6, 37.4, 28.8, 47.9, 5.4, 64.1, 45.1, 41, 36.3, 28.2, 33.8, 9.8, 57.4, 48.4, 58.3, 27.7, 38.4, 36.4, 66.9, 30.7, 34.3, 25.9, 48.5, 52, 0.3, 45.3, 31.9, 21.6, 36.6, 29, 13.2, 41.5, 8.2, 46.6, 30.6, 48.6, 5.6, 39.3, 30.5, 34.2, 61.5, 4.2, 71.3, 42.5, 32.7, 54.4, 19.2, 13.3, 40.3, 72, 21.8, 49.5, 38.7, 9.6, 49.6, 32, 30.9, 28.6, 30.1, 29.8, 67.9, 60.8, 55, 34.6, 32.8, 11.9, 50.5, 32.1, 13.7, 48.6, 32.6, 9.1, 27.6, 35.6, 28.3, 15.1, 54.7, 30.8, 22.2, 27.5, 49.3, 56, 26.1, 57.2, 46.4, 50.3, 43.6, 51.8, 47.5, 15.5, 60.2, 63.6, 45.3, 14.1, 42.1, 31.4, 42.4, 61.7, 60.1, 27.7, 55.9, 3.3, 18.7, 58.1, 46, 14, 41.7, 28.9, 29.1, 56.9, 32.3, -0.8, 29.4, 27.3, 33.5, 39.1, 13.9, 28.7, 29.4, 10.3, 44.3, 57.1, 76, 49.4, 44.9, 23.2, 53.9, 33.6, 32.7, 30, 57, 63.6, 32.9, 8.6, 26.5, 26, 53.3, 40.8, 30.1, 10.5, 47.2, 30.2, 49.3, 52.4, 48.8, 51.4, 40.7, 33.8, 45.7, 28.1, 13.2, 28.4, 31.7, 30, 29.6, 49.5, 35, 62, 51.9, 39, 15.4, 59.1, 54.8, 9.2, 9.7, 35.4, 32.9, 31.3, 30.4, 64.4, 63.4, 32.9, 40.6, 37.5, 52.3, 35.3, 8.1, 6.4, 26.2, 29.2, 29.7, 27.8, 35.2, 34.1, 29.8, 49, 65.6, -1.1, 28.6, 33.7, 48.1, 45.7, 30.3, 32.7, 64.5, 29.8, 52.5, 48.4, 48.8, 26.4, 37.4, 33.2, 46.1, 29.5, -0.9, 49.8, 34.1, 48.9, 12.5, 36.6, 22.1, 57.3, 9.5, 9.4, 58.5, 50.2, 45.3, 25.3, 27.4, 4.5, 58.5, 63.4, 48.7, 42.6, 33, 47.9, 30.3, 54.9, 7.9, 50.2, 11.2, 59.7, 46.5, 57.5, 26.9, 28.5, 29.7, 52.5, 16.9, 29.8, 28.6, 31.2, 65.3, 1.7, 31.4, 52.5, 5.1, 66.1, 51.5, 9.5, 9.8, 41.6, 0.3, 10.4, 15.5, 34.8, 27.5, 43.6, 31.4, 46.3, 4.6, 45.8, 49.2, 10.7, 48.1, 7.3, 33.4, 10.7, 53.4, 28.9, 51.1, 52.4, 55.9, 56.8, 47.2, 46.8, 30.8, 60.3, 53.6, 30.9, 70.8, 11.2, 7.5, 55.8, 14.3, 25.8, 14.5, 30.9, 60.8, 26.8, 16.5, 31.4, 26.6, 10.6, 53.4, 33.1, 33.1, 46.3, 8.2, 56, 14.1, 25.5, 59.6, 61.9, 58.6, 63.1, 47.7, 30.5, 42.4, 56.2, 17, 13.4, 34.4, 1.1, 18.4, 
63.9, 38.6, 15, 30.1, 23.9, 5.9, 53.8, 18.2, 22.7, 45.7, 29.2, 8.4, 52.5, 42, 28.7, 61.7, 35.4, 32.5, 5.5, 6.8, 60.1, 29.4, 31.5, 2.3, 28.3, 29.6, 34.9, 33.2, 28.9, 33.9, 51, 35.4, 52.3, 60, 27.1, 24.7, 57.7, 32.7, 52.5, 66.3, 37.8, 46.3, 38.1, 30.6, 55.6, 44.9, 28.4, 28.9, 19, 7.7, 9.4, 36, 49.9, 42.2, 28.2, 11.5, 52.4, 46.3, 52.4, 27.4, 15.6, 62.3, 51.7, 41.6, 6.2, 10.5, 14.7, 30.4, 23.9, 58.7, 36.1, 47.6, 31.2, 29.1, 60.1, 18, 30, 56.5, 42.7, 27.1, 45.5, 36.6, 46.4, 25.9, 15.4, 31.6, 3.3, 33.6, 63.3, 57.1, 32.3, 11.8, 32.9, 47.2, 31.2, 49.3, 61.7, 11.5, 9.7, 49.6, 45.7, 16.1, 27.4, 22.8, 8.5, 56.2, 26, 45.7, 29, 34.6, 29.4, 3.9, 45.7, 31.7, 52.6, 40.2, 35.5, 5.8, 56.4, 49.5, 30.6, 40.2, 20.8, 43.9, 32.1, 40.8, 45.6, 32.8, 7.4, 27.5, 29.4, 50.8, 43.9, 36.8, 5.5, 61.5, 41.5, 47.5, 13.9, 30.1, 67.3, 27.1, 50.8, 37.4, 28, 25, 37.1, 49.3, 25.3, 26.9, 34.9, 51.8, 33.9, 34.7, 44.2, 10.1, 71.3, 47.5, 23.4, 45.7, 49.4, 32.6, 6.9, 67.8, 56.8, 41.9, 50.7, 31.5, 55, 14.2, 34.8, 26.2, 25.8, 64, 63.8, 56.4, 42.1, 29.5, 49.4, 30.2, 16.2, 30, -0.2, 30.7, 29.6, 57, 41.5, 6.4, 9.7, 47.1, 19.4, 39.8)
# Second half of the sample is appended to the first
xExample <- c(xExample, 32.9, 53.6, 8.4, 32.8, 63.1, 58.4, 7.5, 26, 41.8, 29, 36.9, 41.5, 39.5, 14.1, 27.4, 14.9, 48.4, 34.8, 72.8, 36.9, 27.8, 27.6, 6.1, 43.8, 36.9, 58.5, 55.1, 45.2, 2.6, 20.4, 59, 60.6, 57.7, 29.8, 60.2, 36.9, 29, 28, 46.5, 55, 29.6, 52.6, 38, 45.3, 5.7, 44.8, 35.3, 56.1, 30.3, 32.4, 56.9, 30.8, 44.8, 62.8, 46.1, 57.2, 50.5, 46.4, 37.6, 29.9, 8.6, 35.5, 47.4, 27.2, 36.4, 33.1, 29.4, 25.8, 46, 27.6, 45.7, 32.3, 12.8, 49.8, 13.7, 65.3, 48.5, 39.6, 4, 32.1, 49.6, 44, 74.5, 31, 52.6, 33.3, 56.8, 11.4, 33.7, 34.3, 25.8, 39.8, 7.3, 33.6, 7.9, 49.6, 52.6, 36.5, 43, 14.7, 43.5, 37, 50.8, 46.5, 46.9, 25.4, 32.7, 48.4, 40.3, 45.9, 51.3, 24, 48.3, 39.5, 21.2, 48.1, 56.9, 32.3, 10.2, 9.3, 40.3, 52.8, 34.5, 32.4, 30.1, 10.8, -3.8, 30.4, 58.2, 57.3, 48.9, 36.1, 46.2, 69, 67.8, 58.5, 41.9, 29.6, 51.7, 39.4, 50.8, 29.2, 56.1, 54.4, 17.2, 57.5, 54.1, 48.6, -0.9, 56.3, 27.7, 58.8, 57, 44.1, 6.3, 4.1, 35.9, 60.2, 44.1, 53.9, 33.3, 35.4, 32.1, 56, 56.8, 30.1, 43.1, 64.6, 27.7, 30.7, 53, 66, 29.1, 45, 12.3, 41.3, 54.7, 45.3, 13.3, 9.7, -2, 29.1, 29.5, 31.3, 29.2, 13.8, 26.7, 7.4, 36.8, 42.6, 54.7, 51.3, 42.6, 18, 34, 44.1, 53.6, 44.7, 28.9, 64.9, 60, 66.6, 32.9, 15.5, 37.6, 8.3, 28.5, 16.2, 39.7, 25.9, 8.8, 30.9, 9.9, 39.3, 66.4, 62.4, 53.8, 9.3, 44.7, 50.4, 57.8, 29, 50.1, 28.5, 62.9, 16.3, 54, 45.4, 60.6, 9, 7.7, 64.2, 54.4, 53.3, 45.5, 38, 5.2, 61.7, 10.8, 4.3, 24.8, 26.5, 32.2, 4.5, 49.3, 3.9, 39.6, 26.8, 36.3, 65.1, 59.6, 61.3, 30.1, 65.5, 55.8, 48.2, 49.8, 11.2, 64.2, 29, 44.6, 59.9, 12.6, 51.8, 14.5, 28.8, 49.8, 30.4, 42.7, 2.8, 31.1, 29.2, 27.4, 49.9, 28.2, 59.5, 28.7, 9.4, 30.2, 33.3, 30, 26, 65.1, 55.9, 30.5, 61.1, 50.3, 31.3, 58.2, 41.3, 33.4, 14.8, 51.2, 40.8, 34.1, 33.7, 29.4, 56, 26.4, 30.7, 55.1, 49.7, 37.7, 56.9, 38.5, 28.8, 50.3, 45.7, 13.2, 32.8, 30.5, 30.6, 61.5, 57.7, 33.6, 24.6, 53.9, 36.1, 37.4, 55.5, 27.4, 44.2, 15.4, 56.3, 28.1, 28.8, 67.6, 17.7, 48.5, 57.5, 33.7, 12.9, 19.5, 30.6, 56.8, 75.4, 26, 32.3, 28.3, 10.7, 9, 66.5, 51.6, 30.2, 46, 
44.1, 53, 33.9, 28.4, 53.1, 42.3, 55.2, 42.4, 9.4, 36.3, 26.6, 41.2, 33, 42.1, 27, 25.4, 53.8, 56.7, 22.2, 29.5, 30.9, 9.3, 30.4, 48.1, 30.9, 28.4, 38.6, 28.8, 52, 16.5, 64.3, 56.1, 51.4, 50.2, 30.1, 67.3, 62.3, 12.9, 27.9, 38.9, 29.3, 17.4, 30, 62.5, 40.5, 48, 31.9, 54.7, 27.4, 28.2, 46.6, 14, 61.9, 59.4, 65.4, 30.2, 28.9, 35.4, 55.8, 51.4, 47.8, 34, 56.2, 26.5, 30.2, 8.4, 10.9, 63.9, 41.9, 31.3, 52.8, 36, 45.4, -2, 57.3, 80.3, 41, 13.8, 31.9, 33.8, 48.5, 16.7, 29.5, 6.7, 42.1, 32.2, 45.7, 18.9, 30.5, 30.9, 40.2, 14.6, 41.2, 27, 6.1, 34.9, 57.5, 30.1, 56.6, 62.4, 11.5, 25.7, 14.8, 28.2, 43.5, 37.7, 32.1, 44.4, 56.2, 7.6, 29.4, 63.4, 53, 14.6, 50.1, 62.6, 29.3, 33.5, 52.7)
# True cluster labels (1, 2 or 3) for each observation, in the same order
mix_assign <- c(1, 2, 1, 1, 1, 2, 2, 2, 3, 3, 2, 1, 2, 2, 3, 2, 2, 1, 3, 3, 3, 1, 2, 1, 2, 2, 3, 1, 2, 3, 2, 3, 2, 1, 2, 2, 2, 2, 3, 3, 1, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 1, 2, 3, 1, 3, 3, 1, 2, 1, 2, 3, 2, 1, 2, 3, 3, 2, 1, 2, 2, 3, 2, 1, 1, 2, 2, 3, 2, 2, 1, 2, 3, 2, 2, 3, 3, 2, 2, 2, 3, 3, 1, 2, 3, 1, 2, 2, 1, 3, 2, 3, 1, 2, 3, 1, 3, 2, 2, 3, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 3, 2, 2, 2, 3, 2, 1, 1, 2, 2, 1, 2, 3, 3, 2, 3, 1, 3, 3, 2, 2, 1, 3, 3, 1, 2, 2, 2, 2, 2, 1, 2, 3, 2, 2, 2, 2, 3, 1, 3, 3, 2, 2, 3, 1, 2, 3, 2, 2, 2, 2, 2, 3, 2, 3, 1, 3, 3, 3, 3, 2, 3, 2, 2, 2, 1, 2, 2, 1, 1, 3, 3, 3, 3, 2, 2, 3, 2, 3, 2, 2, 1, 1, 3, 3, 3, 3, 3, 3, 2, 2, 2, 1, 3, 3, 2, 2, 3, 3, 2, 3, 2, 2, 2, 3, 2, 3, 2, 3, 1, 2, 3, 2, 1, 3, 3, 2, 1, 1, 2, 2, 2, 2, 3, 1, 2, 2, 2, 2, 2, 2, 3, 2, 1, 2, 1, 2, 2, 2, 3, 3, 3, 2, 1, 3, 3, 2, 2, 1, 3, 2, 1, 2, 2, 1, 1, 2, 1, 1, 1, 3, 3, 2, 3, 2, 1, 2, 2, 1, 2, 1, 3, 1, 2, 3, 2, 2, 2, 2, 2, 2, 3, 2, 2, 3, 2, 1, 1, 2, 1, 3, 1, 3, 2, 3, 1, 3, 3, 1, 2, 3, 2, 2, 1, 2, 1, 3, 2, 2, 2, 2, 2, 3, 2, 2, 1, 1, 3, 1, 1, 2, 3, 1, 3, 3, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 2, 3, 1, 1, 2, 3, 2, 1, 3, 3, 2, 3, 3, 3, 2, 3, 2, 2, 3, 3, 2, 3, 2, 2, 2, 2, 2, 3, 2, 2, 3, 3, 1, 1, 1, 2, 2, 2, 3, 1, 2, 2, 2, 3, 1, 2, 2, 2, 1, 1, 1, 3, 3, 2, 3, 2, 3, 2, 2, 1, 3, 2, 2, 3, 2, 3, 2, 3, 1, 3, 1, 3, 2, 2, 3, 1, 3, 2, 3, 2, 2, 1, 1, 2, 2, 1, 3, 3, 1, 2, 3, 2, 3, 2, 3, 1, 2, 2, 2, 2, 3, 1, 2, 2, 3, 2, 2, 2, 3, 2, 2, 3, 1, 3, 3, 2, 2, 2, 1, 2, 2, 2, 1, 3, 2, 3, 2, 2, 3, 3, 2, 2, 3, 3, 3, 2, 3, 3, 2, 1, 2, 2, 3, 2, 2, 3, 1, 2, 2, 2, 2, 3, 2, 1, 3, 3, 2, 2, 2, 2, 2, 3, 2, 3, 1, 3, 1, 3, 3, 2, 2, 1, 1, 2, 2, 2, 3, 2, 1, 3, 2, 2, 1, 3, 2, 3, 2, 2, 2, 1)
# Labels for the second half of the sample are appended
mix_assign <- c(mix_assign, 3, 1, 2, 3, 2, 3, 3, 3, 1, 2, 3, 2, 2, 2, 1, 2, 2, 2, 2, 3, 2, 3, 3, 3, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 3, 3, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 1, 3, 2, 3, 2, 3, 3, 3, 2, 3, 2, 3, 1, 2, 1, 2, 2, 2, 1, 3, 2, 2, 2, 3, 2, 3, 2, 1, 3, 2, 3, 2, 1, 2, 1, 2, 2, 3, 2, 1, 2, 3, 2, 2, 2, 3, 3, 2, 2, 2, 2, 3, 2, 2, 1, 2, 2, 3, 1, 1, 2, 2, 3, 3, 3, 1, 1, 3, 2, 2, 2, 3, 2, 2, 2, 2, 2, 3, 2, 2, 2, 3, 2, 2, 1, 2, 2, 2, 1, 2, 3, 2, 2, 2, 1, 1, 3, 2, 2, 2, 3, 3, 3, 2, 2, 3, 2, 2, 3, 3, 2, 2, 3, 2, 1, 2, 2, 2, 1, 1, 1, 3, 3, 3, 3, 1, 3, 1, 2, 2, 2, 2, 2, 1, 3, 2, 2, 2, 3, 2, 2, 2, 3, 1, 2, 1, 3, 1, 2, 3, 1, 3, 1, 3, 2, 2, 2, 1, 2, 2, 2, 3, 2, 3, 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 1, 1, 3, 3, 3, 1, 2, 1, 2, 3, 3, 2, 2, 2, 3, 2, 2, 2, 2, 1, 2, 3, 2, 2, 1, 2, 1, 3, 2, 3, 2, 1, 3, 3, 3, 2, 3, 2, 3, 1, 2, 2, 3, 3, 2, 2, 3, 2, 2, 3, 2, 2, 3, 1, 2, 2, 3, 3, 3, 2, 3, 2, 2, 2, 2, 2, 2, 3, 2, 2, 1, 3, 3, 3, 2, 2, 3, 3, 2, 2, 2, 2, 3, 2, 1, 2, 3, 3, 2, 1, 2, 2, 2, 1, 1, 3, 2, 2, 3, 3, 3, 1, 1, 2, 2, 3, 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 3, 3, 2, 3, 2, 3, 3, 2, 2, 2, 3, 2, 1, 3, 2, 3, 3, 2, 3, 2, 1, 2, 2, 2, 2, 3, 2, 2, 1, 3, 2, 2, 1, 3, 2, 2, 2, 3, 2, 3, 3, 2, 1, 2, 2, 2, 3, 3, 2, 2, 2, 2, 2, 2, 3, 3, 1, 1, 2, 2, 3, 2, 2, 2, 1, 2, 2, 2, 1, 3, 3, 2, 1, 3, 1, 2, 3, 2, 1, 2, 3, 2, 1, 2, 3, 1, 3, 2, 3, 2, 2, 1, 3, 1, 3, 2, 2, 3, 2, 2, 1, 3, 2, 2, 2, 2, 2, 3, 3, 2)
# Combine the sample and its true labels into one data frame for fitting
mix_example <- data.frame(x=xExample, assignment=mix_assign)
str(mix_example)
## 'data.frame': 1000 obs. of 2 variables:
## $ x : num 7.3 58.7 9.7 16.9 6.3 35.1 33.5 61.3 28.3 24.3 ...
## $ assignment: num 1 2 1 1 1 2 2 2 3 3 ...
library(flexmix)
## Loading required package: lattice
set.seed(1515)
# If wanting verbose output
# control = list(tolerance = 1e-15, verbose = 1, iter = 1e4)
fit_mix_example <- flexmix(x ~ 1, data = mix_example, k = 3, model = FLXMCnorm1(),
control = list(tolerance = 1e-15, iter = 1e4)
)
proportions <- prior(fit_mix_example)
comp_1 <- parameters(fit_mix_example, component = 1)
comp_2 <- parameters(fit_mix_example, component = 2)
comp_3 <- parameters(fit_mix_example, component = 3)
# Weighted normal density for one mixture component: the component's
# density curve scaled by its prior proportion, so the curves can be
# overlaid on a density histogram of the pooled data.
fun_prop <- function(x, mean, sd, proportion){
  component_density <- dnorm(x = x, mean = mean, sd = sd)
  proportion * component_density
}
ggplot(mix_example) +
geom_histogram(aes(x = x, y = ..density..)) +
stat_function(geom = "line", fun = fun_prop,
args = list(mean = comp_1[1], sd = comp_1[2], proportion = proportions[1])
) +
stat_function(geom = "line", fun = fun_prop,
args = list(mean = comp_2[1], sd = comp_2[2], proportion = proportions[2])
) +
stat_function(geom = "line", fun = fun_prop,
args = list(mean = comp_3[1], sd = comp_3[2], proportion = proportions[3])
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Explore the first assignments
head(clusters(fit_mix_example))
## [1] 2 1 2 2 2 3
# Explore the first real labels
head(mix_example$assignment)
## [1] 1 2 1 1 1 2
# Create frequency table
table(mix_example$assignment, clusters(fit_mix_example))
##
## 1 2 3
## 1 0 184 1
## 2 464 5 37
## 3 18 2 289
genderData <- readr::read_csv("./RInputFiles/gender.csv")
## Parsed with column specification:
## cols(
## Gender = col_character(),
## Height = col_double(),
## Weight = col_double(),
## BMI = col_double(),
## probability = col_double()
## )
str(genderData)
## Classes 'tbl_df', 'tbl' and 'data.frame': 10000 obs. of 5 variables:
## $ Gender : chr "Male" "Male" "Male" "Male" ...
## $ Height : num 73.8 68.8 74.1 71.7 69.9 ...
## $ Weight : num 242 162 213 220 206 ...
## $ BMI : num 0.0444 0.0343 0.0387 0.0428 0.0423 ...
## $ probability: num 5.78e-06 6.06e-01 2.63e-05 3.63e-04 4.61e-03 ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 5
## .. ..$ Gender : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ Height : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ Weight : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ BMI : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ probability: list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
set.seed(1313)
fit_with_covariance <- flexmix(cbind(Weight, BMI) ~ 1, data = genderData, k = 2,
model = FLXMCmvnorm(diag = FALSE),
control = list(tolerance = 1e-15, iter.max = 1000)
)
# Get the parameters
comp_1 <- parameters(fit_with_covariance, component = 1)
comp_2 <- parameters(fit_with_covariance, component = 2)
# The means
mean_comp_1 <- comp_1[1:2]
mean_comp_1
## [1] 135.97738684 0.03334097
mean_comp_2 <- comp_2[1:2]
mean_comp_2
## [1] 186.6849545 0.0391035
# The covariance matrices
covariance_comp_1 <- matrix(comp_1[3:6], nrow = 2)
covariance_comp_1
## [,1] [,2]
## [1,] 370.85097459 4.712215e-02
## [2,] 0.04712215 8.103393e-06
covariance_comp_2 <- matrix(comp_2[3:6], nrow = 2)
covariance_comp_2
## [,1] [,2]
## [1,] 405.22840544 2.742036e-02
## [2,] 0.02742036 4.668417e-06
# Create ellipse curve 1
ellipse_comp_1 <- ellipse::ellipse(x = covariance_comp_1, centre = mean_comp_1, npoints = nrow(genderData))
head(ellipse_comp_1)
## x y
## [1,] 181.4301 0.04005980
## [2,] 181.4223 0.04006096
## [3,] 181.4144 0.04006212
## [4,] 181.4065 0.04006327
## [5,] 181.3986 0.04006442
## [6,] 181.3906 0.04006557
# Create ellipse curve 2
ellipse_comp_2 <- ellipse::ellipse(x = covariance_comp_2, centre = mean_comp_2, npoints = nrow(genderData))
head(ellipse_comp_2)
## x y
## [1,] 231.1740 0.04387866
## [2,] 231.1607 0.04388009
## [3,] 231.1473 0.04388151
## [4,] 231.1340 0.04388294
## [5,] 231.1206 0.04388436
## [6,] 231.1072 0.04388578
# Plot the ellipses
genderData %>%
ggplot(aes(x = Weight, y = BMI)) + geom_point()+
geom_path(data = data.frame(ellipse_comp_1), aes(x=x,y=y), col = "red") +
geom_path(data = data.frame(ellipse_comp_2), aes(x=x,y=y), col = "blue")
# Check the assignments
table(genderData$Gender, clusters(fit_with_covariance))
##
## 1 2
## Female 4540 460
## Male 386 4614
Chapter 4 - Mixture Models Beyond Gaussians
Bernoulli Mixture Models:
Bernoulli Mixture Models with flexmix:
Poisson Mixture Models:
Poisson Mixture Models with flexmix:
Example code includes:
# Create the vector of probabilities
p_cluster_1 <- c(0.8, 0.8, 0.2, 0.9)
# Create the sample for each pixel
set.seed(18102308)
pixel_1 <- sample(c(0, 1), 100, replace = TRUE, prob = c(1-p_cluster_1[1], p_cluster_1[1]))
pixel_2 <- sample(c(0, 1), 100, replace = TRUE, prob = c(1-p_cluster_1[2], p_cluster_1[2]))
pixel_3 <- sample(c(0, 1), 100, replace = TRUE, prob = c(1-p_cluster_1[3], p_cluster_1[3]))
pixel_4 <- sample(c(0, 1), 100, replace = TRUE, prob = c(1-p_cluster_1[4], p_cluster_1[4]))
# Combine the samples
sample_cluster_1 <- cbind(pixel_1, pixel_2, pixel_3, pixel_4)
# Have a look at the sample
head(sample_cluster_1)
## pixel_1 pixel_2 pixel_3 pixel_4
## [1,] 1 1 0 1
## [2,] 1 1 0 1
## [3,] 1 1 0 1
## [4,] 1 1 0 1
## [5,] 0 1 0 1
## [6,] 1 1 0 1
digitUse2 <- rowSums(digitKey[, c(1, 3, 10)]) == 1
digits_sample_2 <- digitData[digitUse2, ]
dim(digits_sample_2)
## [1] 478 256
# transform into matrix
digits_sample_2 <- as.matrix(digits_sample_2)
# dimension
dim(digits_sample_2)
## [1] 478 256
# look to the first observation
show_digit(digits_sample_2[1, ])
# look to the last observation
show_digit(digits_sample_2[nrow(digits_sample_2), ])
set.seed(1513)
# Fit Bernoulli mixture model
bernoulli_mix_model <- flexmix(digits_sample_2 ~ 1, k = 3, model = FLXMCmvbinary(),
control = list(tolerance = 1e-15, iter.max = 1000)
)
prior(bernoulli_mix_model)
## [1] 0.3117220 0.3353131 0.3529649
# Extract the parameters for each cluster
param_comp_1 <- parameters(bernoulli_mix_model, component = 1)
param_comp_2 <- parameters(bernoulli_mix_model, component = 2)
param_comp_3 <- parameters(bernoulli_mix_model, component = 3)
# Visualize the clusters
show_digit(param_comp_1)
show_digit(param_comp_2)
show_digit(param_comp_3)
set.seed(1541)
# Create the vector of lambdas
lambda_1 <- c(150, 300, 50)
# Create the sample of each crime
assault_1 <- rpois(n = 10, lambda = lambda_1[1])
robbery_1 <- rpois(n = 10, lambda = lambda_1[2])
battery_1 <- rpois(n = 10, lambda = lambda_1[3])
# Combine the results
cities_1 <- cbind(assault_1, robbery_1, battery_1)
# Check the sample
cities_1
## assault_1 robbery_1 battery_1
## [1,] 154 297 55
## [2,] 142 276 50
## [3,] 166 312 41
## [4,] 158 273 56
## [5,] 129 278 52
## [6,] 150 307 39
## [7,] 140 321 49
## [8,] 152 321 53
## [9,] 143 318 42
## [10,] 125 324 51
crimes <- readr::read_csv("./RInputFiles/CoC_crimes.csv")
## Parsed with column specification:
## cols(
## COMMUNITY = col_character(),
## ASSAULT = col_integer(),
## BATTERY = col_integer(),
## BURGLARY = col_integer(),
## `CRIMINAL DAMAGE` = col_integer(),
## `CRIMINAL TRESPASS` = col_integer(),
## `DECEPTIVE PRACTICE` = col_integer(),
## `MOTOR VEHICLE THEFT` = col_integer(),
## NARCOTICS = col_integer(),
## OTHER = col_integer(),
## `OTHER OFFENSE` = col_integer(),
## ROBBERY = col_integer(),
## THEFT = col_integer()
## )
dim(crimes)
## [1] 77 13
names(crimes) <- stringr::str_replace_all(stringr::str_to_lower(names(crimes)), " ", ".")
# Check with glimpse
glimpse(crimes)
## Observations: 77
## Variables: 13
## $ community <chr> "ALBANY PARK", "ARCHER HEIGHTS", "ARMOUR S...
## $ assault <int> 123, 51, 74, 169, 708, 1198, 118, 135, 337...
## $ battery <int> 429, 134, 184, 448, 1681, 3347, 280, 350, ...
## $ burglary <int> 147, 92, 55, 194, 339, 517, 76, 145, 327, ...
## $ criminal.damage <int> 287, 114, 99, 379, 859, 1666, 150, 310, 52...
## $ criminal.trespass <int> 38, 23, 56, 43, 228, 265, 29, 36, 88, 29, ...
## $ deceptive.practice <int> 137, 67, 59, 178, 310, 767, 73, 200, 314, ...
## $ motor.vehicle.theft <int> 176, 50, 37, 189, 281, 732, 58, 123, 411, ...
## $ narcotics <int> 27, 18, 9, 30, 345, 1456, 15, 22, 119, 10,...
## $ other <int> 107, 37, 48, 114, 584, 1261, 76, 88, 238, ...
## $ other.offense <int> 158, 44, 35, 164, 590, 1130, 94, 142, 339,...
## $ robbery <int> 144, 30, 98, 111, 349, 829, 65, 109, 172, ...
## $ theft <int> 690, 180, 263, 461, 1201, 2137, 239, 669, ...
# Transform into a matrix, without `community`
matrix_crimes <- crimes %>%
select(-community) %>%
as.matrix()
# Check the first values
head(matrix_crimes)
## assault battery burglary criminal.damage criminal.trespass
## [1,] 123 429 147 287 38
## [2,] 51 134 92 114 23
## [3,] 74 184 55 99 56
## [4,] 169 448 194 379 43
## [5,] 708 1681 339 859 228
## [6,] 1198 3347 517 1666 265
## deceptive.practice motor.vehicle.theft narcotics other other.offense
## [1,] 137 176 27 107 158
## [2,] 67 50 18 37 44
## [3,] 59 37 9 48 35
## [4,] 178 189 30 114 164
## [5,] 310 281 345 584 590
## [6,] 767 732 1456 1261 1130
## robbery theft
## [1,] 144 690
## [2,] 30 180
## [3,] 98 263
## [4,] 111 461
## [5,] 349 1201
## [6,] 829 2137
set.seed(2017)
# Fit the Poisson mixture model
poisson_mm <- stepFlexmix(matrix_crimes ~ 1, k = 1:15, nrep = 5, model = FLXMCmvpois(),
control = list(tolerance = 1e-15, iter.max = 1000)
)
## 1 : * * * * *
## 2 : * * * * *
## 3 : * * * * *
## 4 : * * * * *
## 5 : * * * * *
## 6 : * * * * *
## 7 : * * * * *
## 8 : * * * * *
## 9 : * * * * *
## 10 : * * * * *
## 11 : * * * * *
## 12 : * * * * *
## 13 : * * * * *
## 14 : * * * * *
## 15 : * * * * *
# Select the model that minimizes the BIC
best_poisson_mm <- getModel(poisson_mm, which = "BIC")
# Get the parameters into a data frame
params_lambdas <- data.frame(parameters(best_poisson_mm))
# Add the column with the type of crime
params_lambdas_crime <- params_lambdas %>%
mutate(crime = colnames(matrix_crimes))
# Plot the clusters with their lambdas
params_lambdas_crime %>%
gather(cluster, lambdas, -crime) %>%
ggplot(aes(x = crime, y = lambdas, fill = crime)) +
geom_bar(stat = "identity") +
facet_wrap(~ cluster) +
theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "none")
# Add the cluster assignments
crimes_with_clusters <- crimes %>%
mutate(cluster = factor(clusters(best_poisson_mm)))
# Plot the clusters with the communities
crimes_with_clusters %>%
group_by(cluster) %>%
mutate(number = row_number()) %>%
ggplot(aes(x = cluster, y = number, col = cluster)) +
geom_text(aes(label = community), size = 2.3) +
theme(legend.position="none")
Chapter 1 - The R Package Structure
Introduction to Package Building:
Description and Namespace Files:
Optional Directories:
Example code includes:
# Use the create function to set up your first package
devtools::create("./RPackages/datasummary")
# Take a look at the files and folders in your package
dir("./RPackages/datasummary")
# Create numeric_summary() function
#' Summary statistics for a numeric vector
#'
#' Returns a one-row data frame with the minimum, median, standard
#' deviation, and maximum of \code{x}. Errors if \code{x} is not numeric.
#'
#' @param x A numeric vector to summarize.
#' @param na.rm A logical; should missing values be removed before
#'   computing? Defaults to TRUE, matching data_summary(); existing
#'   positional callers are unaffected (backward-compatible default).
numeric_summary <- function(x, na.rm = TRUE) {
  # Include an error if x is not numeric
  if (!is.numeric(x)) {
    stop("Data must be numeric")
  }
  # Create data frame of the four summary statistics
  data.frame(min = min(x, na.rm = na.rm),
             median = median(x, na.rm = na.rm),
             sd = sd(x, na.rm = na.rm),
             max = max(x, na.rm = na.rm))
}
data(airquality)
# Test numeric_summary() function
numeric_summary(airquality$Ozone, TRUE)
# What is in the R directory before adding a function?
dir("./RPackages/datasummary/R")
# Use the dump() function to write the numeric_summary function
dump("numeric_summary", file = "./RPackages/datasummary/R/numeric_summary.R")
# Verify that the file is in the correct directory
dir("./RPackages/datasummary/R")
# a package should not have the same name as an existing package and its name must only contain letters, numbers, or dots.
# What is in the package at the moment?
dir("./RPackages/datasummary")
# Add the weather data
data(Weather, package="mosaicData")
devtools::use_data(Weather, pkg = "./RPackages/datasummary")
# Add a vignette called "Generating Summaries with Data Summary"
devtools::use_vignette("Generating_Summaries_with_Data_Summary", pkg = "./RPackages/datasummary")
# What directories do you now have in your package now?
dir("./RPackages/datasummary")
#' Summary of numeric columns in a data frame
#'
#' Applies numeric_summary() to each numeric column of \code{x}, binding
#' the per-column results into one data frame with an \code{ID} column
#' naming the source column.
#'
#' @param x A data frame; non-numeric columns are dropped.
#' @param na.rm A logical; remove missing values before summarizing?
data_summary <- function(x, na.rm = TRUE){
  num_data <- select_if(x, .predicate = is.numeric)
  # Bug fix: forward the caller's na.rm instead of hard-coding TRUE, so
  # data_summary(df, na.rm = FALSE) actually propagates missing values
  # (the later copies of this function in these notes already do this).
  map_df(num_data, .f = numeric_summary, na.rm = na.rm, .id = "ID")
}
# Write the function to the R directory
dump("data_summary", file = "./RPackages/datasummary/R/data_summary.R")
dir("./RPackages/datasummary")
Chapter 2 - Documenting Packages
Introduction to roxygen2:
How to export functions:
Documenting other elements:
Documenting a package:
Example code includes:
#' Summary of Numeric Columns
#'
#' Generate specific summaries of numeric columns in a data frame
#'
#' @param x A data frame. Non-numeric columns will be removed
#' @param na.rm A logical indicating whether missing values should be removed
#' @return A \code{data.frame} with one row per numeric column of \code{x}
#'   and an \code{ID} column naming the source column.
#' @import purrr
#' @import dplyr
#' @importFrom tidyr gather
data_summary <- function(x, na.rm = TRUE){
num_data <- select_if(x, .predicate = is.numeric)
map_df(num_data, .f = numeric_summary, na.rm = na.rm, .id = "ID")
}
#' Summary of Numeric Columns
#'
#' Generate specific summaries of numeric columns in a data frame
#'
#' @param x A data frame. Non-numeric columns will be removed
#' @param na.rm A logical indicating whether missing values should be removed
#' @return A \code{data.frame} with one row per numeric column of \code{x}
#'   and an \code{ID} column naming the source column.
#' @import dplyr
#' @import purrr
#' @importFrom tidyr gather
#' @export
data_summary <- function(x, na.rm = TRUE){
num_data <- select_if(x, .predicate = is.numeric)
map_df(num_data, .f = numeric_summary, na.rm = na.rm, .id = "ID")
}
#' Data Summary for Numeric Columns
#'
#' Custom summaries of numeric data in a provided data frame
#'
#' @param x A data.frame containing at least one numeric column
#' @param na.rm A logical indicating whether missing values should be removed
#' @return A \code{data.frame} with one row per numeric column of \code{x}
#'   and an \code{ID} column naming the source column.
#' @import dplyr
#' @import purrr
#' @importFrom tidyr gather
#' @export
#' @examples
#' data_summary(iris)
#' data_summary(airquality, na.rm = FALSE)
data_summary <- function(x, na.rm = TRUE){
num_data <- select_if(x, .predicate = is.numeric)
map_df(num_data, .f = numeric_summary, na.rm = na.rm, .id = "ID")
}
# For code you use \code{text to format}
# To link to other functions you use \link[packageName]{functionName}, although note the package name is only required if the function is not in your package
# To include an unordered list you use \itemize{}. Inside the brackets you mark new items with \item followed by the item text.
#' Data Summary for Numeric Columns
#'
#' Custom summaries of numeric data in a provided data frame
#'
#' @param x A data.frame containing at least one numeric column
#' @param na.rm A logical indicating whether missing values should be removed
#' @import dplyr
#' @import purrr
#' @importFrom tidyr gather
#' @export
#' @examples
#' data_summary(iris)
#' data_summary(airquality, na.rm = FALSE)
#'
## Update the details for the return value
#' @return
#' This function returns a \code{data.frame} including columns:
#' \itemize{
#' \item ID
#' \item min
#' \item median
#' \item sd
#' \item max
#' }
#'
## NOTE(review): removed a duplicate @export tag that appeared twice in
## this roxygen header (the one above is sufficient).
data_summary <- function(x, na.rm = TRUE){
num_data <- select_if(x, .predicate = is.numeric)
map_df(num_data, .f = numeric_summary, na.rm = na.rm, .id = "ID")
}
#' Summary of Numeric Columns
#'
## NOTE(review): blank #' line added so roxygen treats the first line as
## the title and the next paragraph as the description (they were fused).
#' Generate specific summaries of numeric columns in a data frame
#'
#' @param x A data frame. Non-numeric columns will be removed
#' @param na.rm A logical indicating whether missing values should be removed
#' @import dplyr
#' @import purrr
#' @importFrom tidyr gather
#' @export
#' @examples
#' data_summary(iris)
#' data_summary(airquality, na.rm = FALSE)
#'
#' @return This function returns a \code{data.frame} including columns:
#' \itemize{
#' \item ID
#' \item min
#' \item median
#' \item sd
#' \item max
#' }
#'
## Add in the author of the `data_summary()` function.
#' @author My Name <myemail@example.com>
## Update the header to link to the `summary()` function (in the `base` package).
#' @seealso \link[base]{summary}
data_summary <- function(x, na.rm = TRUE){
num_data <- select_if(x, .predicate = is.numeric)
map_df(num_data, .f = numeric_summary, na.rm = na.rm, .id = "ID")
}
#' Custom Data Summaries
#'
#' Easily generate custom data frame summaries
#'
#' @docType package
#' @name datasummary
# The roxygen2 package-documentation sentinel must be the string
# "_PACKAGE": a bare _PACKAGE is not a valid R symbol (identifiers
# cannot begin with an underscore) and fails to parse.
"_PACKAGE"
#' Random Weather Data
#'
#' A dataset containing randomly generated weather data.
#'
#' @format A data frame of 7 rows and 3 columns
#' \describe{
#' \item{Day}{Numeric values giving day of the week, 1 = Monday, 7 = Sunday}
#' \item{Temp}{Integer values giving temperature in degrees Celsius}
#' \item{Weather}{Character values giving precipitation type, Sun if none}
#' }
#' @source Randomly generated data
# Roxygen2 documents a dataset via its quoted name; a bare `weather`
# would try to evaluate the object when the R file is sourced, before
# the lazy-loaded data is available.
"weather"
# Generate package documentation
document("datasummary")
# Examine the contents of the man directory
dir("datasummary/man")
# View the documentation for the data_summary function
help("data_summary")
# View the documentation for the weather dataset
help("weather")
Chapter 3 - Checking and Building R Packages
Why check an R package?
Errors, warnings, and notes:
Differences in package dependencies:
Building packages with continuous integration:
Example code includes:
# Check your package
check("datasummary")
#' Numeric Summaries
#'
## NOTE(review): blank #' line added so roxygen treats the first line as
## the title and the sentence below as the description (they were fused).
#' Summarises numeric data and returns a data frame containing the minimum value, median, standard deviation, and maximum value.
#'
#' @param x a numeric vector containing the values to summarize.
#' @param na.rm a logical value indicating whether NA values should be stripped before the computation proceeds.
#' @return A one-row \code{data.frame} with columns \code{min},
#'   \code{median}, \code{sd}, and \code{max}.
numeric_summary <- function(x, na.rm){
if(!is.numeric(x)){
stop("data must be numeric")
}
data.frame( min = min(x, na.rm = na.rm),
median = median(x, na.rm = na.rm),
sd = sd(x, na.rm = na.rm),
max = max(x, na.rm = na.rm))
}
# The way in which you define variables in tidyverse package functions can cause confusion for the R CMD check, which sees column names and the name of your dataset, and flags them as "undefined global variables".
# To get around this, you can manually specify the data and its columns as a vector to utils::globalVariables(), by including a line of code similar to the following in your package-level documentation:
# utils::globalVariables(c("dataset_name", "col_name_1", "col_name_2"))
# This defines dataset_name, col_name_1, and col_name_2 as global variables, and now you shouldn't get the undefined global variables error.
#' datasummary: Custom Data Summaries
#'
#' Easily generate custom data frame summaries
#'
#' @docType package
#' @name datasummary
# The roxygen2 package-documentation sentinel must be the string
# "_PACKAGE": a bare _PACKAGE is not a valid R symbol (identifiers
# cannot begin with an underscore) and fails to parse.
"_PACKAGE"
# Update this function call
utils::globalVariables(c("weather", "Temp"))
# Add dplyr as an imported dependency to the DESCRIPTION file
use_package("dplyr", pkg = "datasummary")
# Add purrr as an imported dependency to the DESCRIPTION file
use_package("purrr", pkg = "datasummary")
# Add tidyr as an imported dependency to the DESCRIPTION file
use_package("tidyr", pkg = "datasummary")
# Build the package
build("datasummary")
# Examine the contents of the current directory
dir("datasummary")
Chapter 4 - Adding Unit Tests to R Packages
What are unit tests and why write them?
(Example failure message: `vec1` not identical to `c(1, 2)`: names for target but not for current.) Testing errors and warnings:
Testing specific output and non-exported functions:
Grouping and running tests:
Wrap up:
Example code includes:
# Set up the test framework
use_testthat("datasummary")
# Look at the contents of the package root directory
dir("datasummary")
# Look at the contents of the new folder which has been created
dir("datasummary/tests")
# Create a summary of the iris dataset using your data_summary() function
iris_summary <- data_summary(iris)
# Count how many rows are returned
summary_rows <- nrow(iris_summary)
# Use expect_equal to test that calling data_summary() on iris returns 4 rows
expect_equal(summary_rows, 4)
result <- data_summary(weather)
# Update this test so it passes
expect_equal(result$sd, c(2.1, 3.6), tolerance = 0.1)
expected_result <- list(
ID = c("Day", "Temp"),
min = c(1L, 14L),
median = c(4L, 19L),
sd = c(2.16024689946929, 3.65148371670111),
max = c(7L, 24L)
)
# Write a passing test that compares expected_result to result
expect_equivalent(result, expected_result)
# Create a vector containing the numbers 1 through 10
my_vector <- 1:10
# Look at what happens when we apply this vector as an argument to data_summary()
data_summary(my_vector)
# Test if running data_summary() on this vector returns an error
expect_error(data_summary(my_vector))
# Run data_summary on the airquality dataset with na.rm set to FALSE
data_summary(airquality, na.rm=FALSE)
# Use expect_warning to formally test this
expect_warning(data_summary(airquality, na.rm = FALSE))
# Expected result
expected <- data.frame(min = 14L, median = 19L, sd = 3.65148371670111, max = 24L)
# Create variable result by calling numeric summary on the temp column of the weather dataset
result <- datasummary:::numeric_summary(weather$Temp, na.rm = TRUE)
# Test that the value returned matches the expected value
expect_equal(result, expected)
# Use context() and test_that() to group the tests below together
context("Test data_summary()")
test_that("data_summary() handles errors correctly", {
# Create a vector
my_vector <- 1:10
# Use expect_error()
expect_error(data_summary(my_vector))
# Use expect_warning()
expect_warning(data_summary(airquality, na.rm = FALSE))
})
# Run the tests on the datasummary package
test("datasummary")
Chapter 1 - Evaluating Your Measure with Factor Analysis
Introduction to Exploratory Factor Analysis:
Overview of the Measure Development Process:
Measure Features: Correlations and Reliability:
Example code includes:
# Load the psych package
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:lavaan':
##
## cor2cov
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
gcbs <- readRDS("./RInputFiles/GCBS_data.rds")
glimpse(gcbs)
## Observations: 2,495
## Variables: 15
## $ Q1 <int> 5, 5, 2, 5, 5, 1, 4, 5, 1, 1, 4, 5, 5, 5, 5, 4, 4, 2, 4, 5...
## $ Q2 <int> 5, 5, 4, 4, 4, 1, 3, 4, 1, 2, 4, 5, 4, 4, 4, 4, 4, 1, 2, 2...
## $ Q3 <int> 3, 5, 1, 1, 1, 1, 3, 3, 1, 1, 4, 1, 2, 4, 5, 2, 1, 1, 1, 1...
## $ Q4 <int> 5, 5, 2, 2, 4, 1, 3, 3, 1, 1, 5, 5, 4, 5, 5, 5, 3, 1, 1, 3...
## $ Q5 <int> 5, 5, 2, 4, 4, 1, 4, 4, 1, 1, 5, 5, 5, 4, 5, 5, 4, 1, 1, 4...
## $ Q6 <int> 5, 3, 2, 5, 5, 1, 3, 5, 1, 5, 5, 5, 5, 5, 5, 4, 2, 1, 1, 3...
## $ Q7 <int> 5, 5, 4, 4, 4, 1, 3, 5, 1, 1, 4, 3, 3, 5, 5, 5, 4, 1, 1, 2...
## $ Q8 <int> 3, 5, 2, 1, 3, 1, 4, 5, 1, 1, 4, 1, 3, 5, 5, 1, 1, 1, 1, 3...
## $ Q9 <int> 4, 1, 2, 4, 1, 1, 2, 5, 1, 1, 2, 1, 5, 5, 5, 3, 1, 1, 1, 4...
## $ Q10 <int> 5, 4, 4, 5, 5, 1, 3, 5, 1, 4, 5, 5, 3, 5, 5, 5, 4, 2, 3, 4...
## $ Q11 <int> 5, 4, 2, 5, 5, 1, 3, 5, 1, 1, 4, 5, 4, 5, 4, 4, 4, 2, 2, 4...
## $ Q12 <int> 5, 5, 4, 5, 5, 1, 2, 5, 1, 1, 2, 5, 3, 5, 3, 5, 1, 1, 2, 2...
## $ Q13 <int> 3, 4, 0, 1, 3, 1, 2, 3, 1, 1, 2, 1, 3, 4, 5, 4, 1, 1, 1, 4...
## $ Q14 <int> 5, 4, 2, 4, 5, 1, 3, 4, 1, 1, 1, 5, 3, 4, 5, 5, 4, 1, 2, 4...
## $ Q15 <int> 5, 5, 4, 5, 5, 1, 4, 5, 1, 5, 5, 5, 5, 5, 5, 5, 4, 2, 3, 4...
# Conduct a single-factor EFA
EFA_model <- fa(gcbs)
# View the results
EFA_model
## Factor Analysis using method = minres
## Call: fa(r = gcbs)
## Standardized loadings (pattern matrix) based upon correlation matrix
## MR1 h2 u2 com
## Q1 0.70 0.49 0.51 1
## Q2 0.72 0.52 0.48 1
## Q3 0.64 0.41 0.59 1
## Q4 0.77 0.59 0.41 1
## Q5 0.67 0.45 0.55 1
## Q6 0.75 0.56 0.44 1
## Q7 0.73 0.54 0.46 1
## Q8 0.65 0.43 0.57 1
## Q9 0.70 0.48 0.52 1
## Q10 0.56 0.32 0.68 1
## Q11 0.72 0.52 0.48 1
## Q12 0.79 0.62 0.38 1
## Q13 0.68 0.46 0.54 1
## Q14 0.74 0.55 0.45 1
## Q15 0.57 0.33 0.67 1
##
## MR1
## SS loadings 7.27
## Proportion Var 0.48
##
## Mean item complexity = 1
## Test of the hypothesis that 1 factor is sufficient.
##
## The degrees of freedom for the null model are 105 and the objective function was 9.31 with Chi Square of 23173.8
## The degrees of freedom for the model are 90 and the objective function was 1.93
##
## The root mean square of the residuals (RMSR) is 0.08
## The df corrected root mean square of the residuals is 0.09
##
## The harmonic number of observations is 2495 with the empirical chi square 3398.99 with prob < 0
## The total number of observations was 2495 with Likelihood Chi Square = 4809.34 with prob < 0
##
## Tucker Lewis Index of factoring reliability = 0.761
## RMSEA index = 0.145 and the 90 % confidence intervals are 0.142 0.149
## BIC = 4105.36
## Fit based upon off diagonal values = 0.97
## Measures of factor score adequacy
## MR1
## Correlation of (regression) scores with factors 0.97
## Multiple R square of scores with factors 0.94
## Minimum correlation of possible factor scores 0.87
# Set up the single-factor EFA
EFA_model <- fa(gcbs)
# View the factor loadings
EFA_model$loadings
##
## Loadings:
## MR1
## Q1 0.703
## Q2 0.719
## Q3 0.638
## Q4 0.770
## Q5 0.672
## Q6 0.746
## Q7 0.734
## Q8 0.654
## Q9 0.695
## Q10 0.565
## Q11 0.719
## Q12 0.786
## Q13 0.679
## Q14 0.743
## Q15 0.574
##
## MR1
## SS loadings 7.267
## Proportion Var 0.484
# Create a path diagram of the items' factor loadings
fa.diagram(EFA_model)
# Take a look at the first few lines of the response data and their corresponding sum scores
head(gcbs)
## Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 Q11 Q12 Q13 Q14 Q15
## 1 5 5 3 5 5 5 5 3 4 5 5 5 3 5 5
## 2 5 5 5 5 5 3 5 5 1 4 4 5 4 4 5
## 3 2 4 1 2 2 2 4 2 2 4 2 4 0 2 4
## 4 5 4 1 2 4 5 4 1 4 5 5 5 1 4 5
## 5 5 4 1 4 4 5 4 3 1 5 5 5 3 5 5
## 6 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
rowSums(gcbs[1:6, ])
## 1 2 3 4 5 6
## 68 65 37 55 59 15
# Then look at the first few lines of individuals' factor scores
head(EFA_model$scores)
## MR1
## [1,] 1.5614675
## [2,] 1.3432026
## [3,] -0.3960355
## [4,] 0.7478868
## [5,] 1.0435203
## [6,] -1.7290812
# To get a feel for how the factor scores are distributed, look at their summary statistics and density plot.
summary(EFA_model$scores)
## MR1
## Min. :-1.854703
## 1st Qu.:-0.783260
## Median :-0.001971
## Mean : 0.000000
## 3rd Qu.: 0.728568
## Max. : 1.949580
plot(density(EFA_model$scores, na.rm = TRUE), main = "Factor Scores")
# Basic descriptive statistics
describe(gcbs)
## vars n mean sd median trimmed mad min max range skew kurtosis
## Q1 1 2495 3.47 1.46 4 3.59 1.48 0 5 5 -0.55 -1.10
## Q2 2 2495 2.96 1.49 3 2.96 1.48 0 5 5 -0.01 -1.40
## Q3 3 2495 2.05 1.39 1 1.82 0.00 0 5 5 0.98 -0.44
## Q4 4 2495 2.64 1.45 2 2.55 1.48 0 5 5 0.26 -1.34
## Q5 5 2495 3.25 1.47 4 3.32 1.48 0 5 5 -0.35 -1.27
## Q6 6 2495 3.11 1.51 3 3.14 1.48 0 5 5 -0.17 -1.42
## Q7 7 2495 2.67 1.51 2 2.59 1.48 0 5 5 0.28 -1.39
## Q8 8 2495 2.45 1.57 2 2.32 1.48 0 5 5 0.51 -1.30
## Q9 9 2495 2.23 1.42 2 2.05 1.48 0 5 5 0.76 -0.82
## Q10 10 2495 3.50 1.39 4 3.63 1.48 1 5 4 -0.59 -0.94
## Q11 11 2495 3.27 1.40 4 3.34 1.48 0 5 5 -0.35 -1.11
## Q12 12 2495 2.64 1.50 2 2.56 1.48 0 5 5 0.29 -1.37
## Q13 13 2495 2.10 1.38 1 1.89 0.00 0 5 5 0.89 -0.56
## Q14 14 2495 2.96 1.49 3 2.95 1.48 0 5 5 -0.02 -1.43
## Q15 15 2495 4.23 1.10 5 4.47 0.00 0 5 5 -1.56 1.71
## se
## Q1 0.03
## Q2 0.03
## Q3 0.03
## Q4 0.03
## Q5 0.03
## Q6 0.03
## Q7 0.03
## Q8 0.03
## Q9 0.03
## Q10 0.03
## Q11 0.03
## Q12 0.03
## Q13 0.03
## Q14 0.03
## Q15 0.02
# Graphical representation of error
error.dots(gcbs)
# Graphical representation of error
error.bars(gcbs)
# Establish two sets of indices to split the dataset
N <- nrow(gcbs)
indices <- seq(1, N)
indices_EFA <- sample(indices, floor((.5*N)))
indices_CFA <- indices[!(indices %in% indices_EFA)]
# Use those indices to split the dataset into halves for your EFA and CFA
gcbs_EFA <- gcbs[indices_EFA, ]
gcbs_CFA <- gcbs[indices_CFA, ]
# Use the indices from the previous exercise to create a grouping variable
group_var <- vector("numeric", nrow(gcbs))
group_var[indices_EFA] <- 1
group_var[indices_CFA] <- 2
# Bind that grouping variable onto the gcbs dataset
gcbs_grouped <- cbind(gcbs, group_var)
# Compare stats across groups
describeBy(gcbs_grouped, group = group_var)
##
## Descriptive statistics by group
## group: 1
## vars n mean sd median trimmed mad min max range skew
## Q1 1 1247 3.48 1.44 4 3.59 1.48 0 5 5 -0.54
## Q2 2 1247 2.99 1.48 3 2.99 1.48 0 5 5 -0.02
## Q3 3 1247 2.07 1.38 1 1.84 0.00 0 5 5 0.96
## Q4 4 1247 2.62 1.44 2 2.53 1.48 0 5 5 0.26
## Q5 5 1247 3.23 1.47 4 3.30 1.48 0 5 5 -0.33
## Q6 6 1247 3.13 1.50 3 3.16 1.48 0 5 5 -0.22
## Q7 7 1247 2.66 1.51 2 2.58 1.48 0 5 5 0.29
## Q8 8 1247 2.49 1.57 2 2.37 1.48 0 5 5 0.46
## Q9 9 1247 2.21 1.39 2 2.01 1.48 0 5 5 0.79
## Q10 10 1247 3.51 1.40 4 3.63 1.48 1 5 4 -0.59
## Q11 11 1247 3.30 1.38 4 3.38 1.48 0 5 5 -0.39
## Q12 12 1247 2.63 1.51 2 2.54 1.48 0 5 5 0.31
## Q13 13 1247 2.14 1.40 1 1.94 0.00 0 5 5 0.82
## Q14 14 1247 2.95 1.49 3 2.94 1.48 0 5 5 -0.01
## Q15 15 1247 4.24 1.11 5 4.48 0.00 1 5 4 -1.57
## group_var 16 1247 1.00 0.00 1 1.00 0.00 1 1 0 NaN
## kurtosis se
## Q1 -1.08 0.04
## Q2 -1.37 0.04
## Q3 -0.46 0.04
## Q4 -1.33 0.04
## Q5 -1.28 0.04
## Q6 -1.40 0.04
## Q7 -1.38 0.04
## Q8 -1.34 0.04
## Q9 -0.76 0.04
## Q10 -0.96 0.04
## Q11 -1.05 0.04
## Q12 -1.36 0.04
## Q13 -0.69 0.04
## Q14 -1.41 0.04
## Q15 1.67 0.03
## group_var NaN 0.00
## --------------------------------------------------------
## group: 2
## vars n mean sd median trimmed mad min max range skew
## Q1 1 1248 3.47 1.47 4 3.59 1.48 0 5 5 -0.55
## Q2 2 1248 2.94 1.51 3 2.93 1.48 0 5 5 0.01
## Q3 3 1248 2.03 1.39 1 1.80 0.00 0 5 5 1.00
## Q4 4 1248 2.66 1.46 2 2.57 1.48 0 5 5 0.26
## Q5 5 1248 3.28 1.47 4 3.35 1.48 0 5 5 -0.37
## Q6 6 1248 3.09 1.51 3 3.11 1.48 0 5 5 -0.11
## Q7 7 1248 2.67 1.51 2 2.59 1.48 0 5 5 0.28
## Q8 8 1248 2.41 1.57 2 2.27 1.48 0 5 5 0.57
## Q9 9 1248 2.26 1.45 2 2.08 1.48 0 5 5 0.73
## Q10 10 1248 3.50 1.38 4 3.62 1.48 1 5 4 -0.58
## Q11 11 1248 3.23 1.42 3 3.29 1.48 0 5 5 -0.31
## Q12 12 1248 2.66 1.50 2 2.58 1.48 0 5 5 0.26
## Q13 13 1248 2.06 1.37 1 1.85 0.00 0 5 5 0.95
## Q14 14 1248 2.96 1.49 3 2.95 1.48 0 5 5 -0.03
## Q15 15 1248 4.21 1.10 5 4.45 0.00 0 5 5 -1.56
## group_var 16 1248 2.00 0.00 2 2.00 0.00 2 2 0 NaN
## kurtosis se
## Q1 -1.13 0.04
## Q2 -1.43 0.04
## Q3 -0.43 0.04
## Q4 -1.35 0.04
## Q5 -1.26 0.04
## Q6 -1.44 0.04
## Q7 -1.40 0.04
## Q8 -1.25 0.04
## Q9 -0.89 0.04
## Q10 -0.93 0.04
## Q11 -1.17 0.04
## Q12 -1.37 0.04
## Q13 -0.42 0.04
## Q14 -1.45 0.04
## Q15 1.74 0.03
## group_var NaN 0.00
# Compare item statistics within and between the two random groups.
# ICC1 (percentage of variance due to groups) is ~0 for every item below,
# as expected when groups are formed by a random split.
statsBy(gcbs_grouped, group = "group_var")
## Statistics within and between groups
## Call: statsBy(data = gcbs_grouped, group = "group_var")
## Intraclass Correlation 1 (Percentage of variance due to groups)
## Q1 Q2 Q3 Q4 Q5 Q6 Q7
## 0 0 0 0 0 0 0
## Q8 Q9 Q10 Q11 Q12 Q13 Q14
## 0 0 0 0 0 0 0
## Q15 group_var
## 0 1
## Intraclass Correlation 2 (Reliability of group differences)
## Q1 Q2 Q3 Q4 Q5 Q6 Q7
## -93.70 -0.60 -1.00 -1.07 -0.87 -1.35 -104.83
## Q8 Q9 Q10 Q11 Q12 Q13 Q14
## 0.48 -0.36 -84.52 0.40 -2.47 0.49 -337.47
## Q15 group_var
## -1.93 1.00
## eta^2 between groups
## Q1.bg Q2.bg Q3.bg Q4.bg Q5.bg Q6.bg Q7.bg Q8.bg Q9.bg Q10.bg
## 0 0 0 0 0 0 0 0 0 0
## Q11.bg Q12.bg Q13.bg Q14.bg Q15.bg
## 0 0 0 0 0
##
## To see the correlations between and within groups, use the short=FALSE option in your print statement.
## Many results are not shown directly. To see specific objects select from the following list:
## mean sd n F ICC1 ICC2 ci1 ci2 raw rbg pbg rwg nw pwg etabg etawg nwg nG Call
# Take a look at some correlation data
# lowerCor() (psych) prints the lower triangle of the correlation matrix,
# rounded; pairwise deletion handles any missing responses
lowerCor(gcbs, use = "pairwise.complete.obs")
## Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10
## Q1 1.00
## Q2 0.53 1.00
## Q3 0.36 0.40 1.00
## Q4 0.52 0.53 0.50 1.00
## Q5 0.48 0.46 0.40 0.57 1.00
## Q6 0.63 0.55 0.40 0.61 0.50 1.00
## Q7 0.47 0.67 0.42 0.57 0.45 0.54 1.00
## Q8 0.39 0.38 0.78 0.49 0.41 0.41 0.41 1.00
## Q9 0.42 0.49 0.49 0.56 0.46 0.48 0.53 0.48 1.00
## Q10 0.44 0.38 0.32 0.40 0.43 0.41 0.39 0.36 0.37 1.00
## Q11 0.64 0.52 0.34 0.52 0.49 0.62 0.49 0.37 0.46 0.45
## Q12 0.52 0.72 0.44 0.60 0.49 0.59 0.75 0.42 0.57 0.40
## Q13 0.38 0.40 0.71 0.51 0.43 0.42 0.45 0.76 0.54 0.37
## Q14 0.53 0.50 0.43 0.60 0.54 0.55 0.52 0.45 0.55 0.41
## Q15 0.51 0.40 0.27 0.39 0.45 0.47 0.39 0.31 0.32 0.45
## Q11 Q12 Q13 Q14 Q15
## Q11 1.00
## Q12 0.55 1.00
## Q13 0.40 0.49 1.00
## Q14 0.56 0.56 0.50 1.00
## Q15 0.54 0.41 0.30 0.46 1.00
# Take a look at some correlation data
# $p extracts the matrix of p-values from the corr.test() result
# (see ?psych::corr.test for how raw vs. adjusted p-values are arranged)
corr.test(gcbs, use = "pairwise.complete.obs")$p
## Q1 Q2 Q3 Q4 Q5
## Q1 0.000000e+00 1.038105e-175 2.525793e-74 1.746323e-174 5.801103e-143
## Q2 1.384140e-177 0.000000e+00 6.236650e-93 8.388030e-183 1.237758e-127
## Q3 3.608276e-75 3.282447e-94 0.000000e+00 1.087371e-155 1.282718e-94
## Q4 2.359896e-176 1.075388e-184 1.647531e-157 0.000000e+00 7.309882e-214
## Q5 1.054746e-144 2.578663e-129 6.108182e-96 8.032837e-216 0.000000e+00
## Q6 1.477903e-277 1.757495e-198 1.248831e-96 3.037754e-253 1.470071e-154
## Q7 8.142449e-139 0.000000e+00 5.318107e-109 5.260693e-212 2.336455e-122
## Q8 1.549786e-91 5.579984e-85 0.000000e+00 1.898345e-150 1.224898e-101
## Q9 4.344797e-106 3.376084e-148 3.104324e-151 3.108940e-210 1.462606e-133
## Q10 3.550942e-116 9.131376e-87 6.760094e-61 1.298818e-98 1.476449e-111
## Q11 9.499179e-292 1.822105e-173 2.580298e-69 9.037626e-175 2.540009e-153
## Q12 9.097129e-175 0.000000e+00 1.005698e-117 1.550592e-244 2.612164e-152
## Q13 9.052542e-87 1.297778e-96 0.000000e+00 1.798859e-167 1.168044e-111
## Q14 1.912163e-184 3.969981e-158 1.103607e-113 8.219433e-248 4.008639e-188
## Q15 9.793389e-162 3.129498e-96 1.676489e-41 1.255385e-93 3.849304e-125
## Q6 Q7 Q8 Q9 Q10
## Q1 1.448345e-275 4.315498e-137 2.634636e-90 1.433783e-104 1.420377e-114
## Q2 1.493871e-196 0.000000e+00 6.695981e-84 1.958129e-146 1.267356e-85
## Q3 2.997195e-95 1.861338e-107 0.000000e+00 1.862595e-149 2.704037e-60
## Q4 2.916244e-251 4.734624e-210 1.120024e-148 2.766957e-208 3.376926e-97
## Q5 9.408454e-153 9.813111e-121 3.429715e-100 7.459290e-132 5.462862e-110
## Q6 0.000000e+00 2.582592e-187 1.111480e-101 4.903141e-145 2.966233e-99
## Q7 3.149502e-189 0.000000e+00 1.110564e-101 4.177923e-178 1.437786e-89
## Q8 3.704932e-103 3.582463e-103 0.000000e+00 3.981155e-142 1.778530e-78
## Q9 8.755609e-147 5.497267e-180 7.372509e-144 0.000000e+00 2.784487e-79
## Q10 1.098605e-100 9.585238e-91 2.223163e-79 3.093874e-80 0.000000e+00
## Q11 4.981130e-268 8.601767e-153 2.643708e-82 1.655959e-131 4.633595e-125
## Q12 2.403960e-231 0.000000e+00 5.764588e-109 7.071640e-216 1.561964e-98
## Q13 1.353745e-109 7.858024e-124 0.000000e+00 2.581509e-186 4.747674e-81
## Q14 7.044729e-195 6.545724e-170 2.485870e-124 2.858794e-201 3.168647e-104
## Q15 1.340087e-134 7.843219e-91 4.328489e-55 4.516727e-61 2.157366e-124
## Q11 Q12 Q13 Q14 Q15
## Q1 9.404187e-290 6.597467e-173 1.267356e-85 1.472366e-182 6.659505e-160
## Q2 1.293694e-171 0.000000e+00 2.997195e-95 2.659887e-156 6.884895e-95
## Q3 1.548179e-68 4.123361e-116 0.000000e+00 4.304066e-112 1.676489e-41
## Q4 6.597467e-173 1.457557e-242 1.241212e-165 7.808462e-246 2.259693e-92
## Q5 1.600206e-151 1.593420e-150 4.438568e-110 3.206912e-186 1.809173e-123
## Q6 4.831696e-266 2.235683e-229 4.873482e-108 5.847125e-193 6.968453e-133
## Q7 5.333096e-151 0.000000e+00 3.378951e-122 4.582007e-168 1.254915e-89
## Q8 2.908079e-81 1.959960e-107 0.000000e+00 1.093783e-122 1.298547e-54
## Q9 8.114201e-130 6.505909e-214 2.039392e-184 2.458563e-199 2.258363e-60
## Q10 2.131454e-123 3.904910e-97 4.747674e-80 1.013967e-102 9.708145e-123
## Q11 0.000000e+00 1.571210e-193 1.584049e-94 1.610933e-205 1.319712e-186
## Q12 1.870488e-195 0.000000e+00 9.395071e-146 1.350081e-206 3.398356e-100
## Q13 7.920246e-96 1.648258e-147 0.000000e+00 6.585550e-154 2.751475e-52
## Q14 1.851647e-207 1.534183e-208 1.013162e-155 0.000000e+00 2.883420e-130
## Q15 1.629274e-188 1.171847e-101 1.375737e-52 5.766839e-132 0.000000e+00
# Take a look at some correlation data
# $ci gives each item pair's confidence interval (lower, r, upper) and p-value.
# NOTE(review): corr.test() is re-run here after the $p chunk above; the result
# could be stored once and both $p and $ci extracted from it.
corr.test(gcbs, use = "pairwise.complete.obs")$ci
## lower r upper p
## Q1-Q2 0.4970162 0.5259992 0.5538098 1.384140e-177
## Q1-Q3 0.3206223 0.3553928 0.3892067 3.608276e-75
## Q1-Q4 0.4953852 0.5244323 0.5523079 2.359896e-176
## Q1-Q5 0.4503342 0.4810747 0.5106759 1.054746e-144
## Q1-Q6 0.6071117 0.6313131 0.6543444 1.477903e-277
## Q1-Q7 0.4412058 0.4722710 0.5022057 8.142449e-139
## Q1-Q8 0.3564216 0.3902059 0.4229712 1.549786e-91
## Q1-Q9 0.3850453 0.4179718 0.4498355 4.344797e-106
## Q1-Q10 0.4034438 0.4357865 0.4670415 3.550942e-116
## Q1-Q11 0.6199265 0.6435136 0.6659388 9.499179e-292
## Q1-Q12 0.4932727 0.5224025 0.5503620 9.097129e-175
## Q1-Q13 0.3464313 0.3805006 0.4135673 9.052542e-87
## Q1-Q14 0.5059498 0.5345780 0.5620298 1.912163e-184
## Q1-Q15 0.4753633 0.5051815 0.5338405 9.793389e-162
## Q2-Q3 0.3618855 0.3955108 0.4281083 3.282447e-94
## Q2-Q4 0.5062706 0.5348860 0.5623248 1.075388e-184
## Q2-Q5 0.4259018 0.4574975 0.4879788 2.578663e-129
## Q2-Q6 0.5234810 0.5513960 0.5781285 1.757495e-198
## Q2-Q7 0.6501266 0.6722188 0.6931753 0.000000e+00
## Q2-Q8 0.3425926 0.3767693 0.4099501 5.579984e-85
## Q2-Q9 0.4556319 0.4861810 0.5155863 3.376084e-148
## Q2-Q10 0.3464233 0.3804928 0.4135598 9.131376e-87
## Q2-Q11 0.4915283 0.5207263 0.5487548 1.822105e-173
## Q2-Q12 0.6962013 0.7158851 0.7344931 0.000000e+00
## Q2-Q13 0.3667134 0.4001964 0.4326439 1.297778e-96
## Q2-Q14 0.4702225 0.5002339 0.5290898 3.969981e-158
## Q2-Q15 0.3659505 0.3994560 0.4319274 3.129498e-96
## Q3-Q4 0.4693335 0.4993781 0.5282679 1.647531e-157
## Q3-Q5 0.3653695 0.3988923 0.4313817 6.108182e-96
## Q3-Q6 0.3667467 0.4002287 0.4326752 1.248831e-96
## Q3-Q7 0.3904688 0.4232258 0.4549125 5.318107e-109
## Q3-Q8 0.7683496 0.7839542 0.7986273 0.000000e+00
## Q3-Q9 0.4601647 0.4905484 0.5197845 3.104324e-151
## Q3-Q10 0.2853436 0.3209913 0.3557521 6.760094e-61
## Q3-Q11 0.3066780 0.3418064 0.3760050 2.580298e-69
## Q3-Q12 0.4061739 0.4384278 0.4695906 1.005698e-117
## Q3-Q13 0.6919756 0.7118867 0.7307155 0.000000e+00
## Q3-Q14 0.3989973 0.4314834 0.4628876 1.103607e-113
## Q3-Q15 0.2285790 0.2654400 0.3015410 1.676489e-41
## Q4-Q5 0.5438704 0.5709273 0.5967985 8.032837e-216
## Q4-Q6 0.5837641 0.6090539 0.6331630 3.037754e-253
## Q4-Q7 0.5394959 0.5667395 0.5927977 5.260693e-212
## Q4-Q8 0.4589969 0.4894234 0.5187032 1.898345e-150
## Q4-Q9 0.5374441 0.5647747 0.5909202 3.108940e-210
## Q4-Q10 0.3706739 0.4040387 0.4363621 1.298818e-98
## Q4-Q11 0.4932765 0.5224062 0.5503655 9.037626e-175
## Q4-Q12 0.5749365 0.6006273 0.6251350 1.550592e-244
## Q4-Q13 0.4833700 0.5128834 0.5412322 1.798859e-167
## Q4-Q14 0.5782876 0.6038268 0.6281838 8.219433e-248
## Q4-Q15 0.3607035 0.3943633 0.4269973 1.255385e-93
## Q5-Q6 0.4650551 0.4952588 0.5243108 1.470071e-154
## Q5-Q7 0.4142088 0.4461981 0.4770864 2.336455e-122
## Q5-Q8 0.3765708 0.4097576 0.4418941 1.224898e-101
## Q5-Q9 0.4328328 0.4641904 0.4944261 1.462606e-133
## Q5-Q10 0.3951535 0.4277623 0.4592945 1.476449e-111
## Q5-Q11 0.4632435 0.4935141 0.5226345 2.540009e-153
## Q5-Q12 0.4617541 0.4920795 0.5212560 2.612164e-152
## Q5-Q13 0.3953385 0.4279414 0.4594675 1.168044e-111
## Q5-Q14 0.5106389 0.5390785 0.5663399 4.008639e-188
## Q5-Q15 0.4189383 0.4507697 0.4814945 3.849304e-125
## Q6-Q7 0.5120337 0.5404170 0.5676214 3.149502e-189
## Q6-Q8 0.3794902 0.4125879 0.4446310 3.704932e-103
## Q6-Q9 0.4534992 0.4841255 0.5136099 8.755609e-147
## Q6-Q10 0.3747259 0.4079687 0.4401639 1.098605e-100
## Q6-Q11 0.5981808 0.6228032 0.6462508 4.981130e-268
## Q6-Q12 0.5610551 0.5873651 0.6124896 2.403960e-231
## Q6-Q13 0.3915639 0.4242864 0.4559371 1.353745e-109
## Q6-Q14 0.5190730 0.5471694 0.5740847 7.044729e-195
## Q6-Q15 0.4345044 0.4658040 0.4959800 1.340087e-134
## Q7-Q8 0.3795181 0.4126150 0.4446571 3.582463e-103
## Q7-Q9 0.5001718 0.5290301 0.5567146 5.497267e-180
## Q7-Q10 0.3547857 0.3886172 0.4214323 9.585238e-91
## Q7-Q11 0.4624648 0.4927641 0.5219138 8.601767e-153
## Q7-Q12 0.7365820 0.7540288 0.7704729 0.000000e+00
## Q7-Q13 0.4167211 0.4486267 0.4794284 7.858024e-124
## Q7-Q14 0.4867147 0.5160994 0.5443174 6.545724e-170
## Q7-Q15 0.3549662 0.3887925 0.4216021 7.843219e-91
## Q8-Q9 0.4490408 0.4798277 0.5094765 7.372509e-144
## Q8-Q10 0.3302521 0.3647668 0.3983073 2.223163e-79
## Q8-Q11 0.3367608 0.3710987 0.4044508 2.643708e-82
## Q8-Q12 0.3904041 0.4231631 0.4548520 5.764588e-109
## Q8-Q13 0.7398147 0.7570774 0.7733440 0.000000e+00
## Q8-Q14 0.4175690 0.4494462 0.4802185 2.485870e-124
## Q8-Q15 0.2696028 0.3056115 0.3407668 4.328489e-55
## Q9-Q10 0.3321729 0.3666358 0.4001210 3.093874e-80
## Q9-Q11 0.4294991 0.4609717 0.4913259 1.655959e-131
## Q9-Q12 0.5439334 0.5709876 0.5968561 7.071640e-216
## Q9-Q13 0.5083417 0.5368740 0.5642288 2.581509e-186
## Q9-Q14 0.5268510 0.5546263 0.5812182 2.858794e-201
## Q9-Q15 0.2858045 0.3214413 0.3561903 4.516727e-61
## Q10-Q11 0.4188025 0.4506384 0.4813679 4.633595e-125
## Q10-Q12 0.3705162 0.4038857 0.4362140 1.561964e-98
## Q10-Q13 0.3339871 0.3684007 0.4018335 4.747674e-81
## Q10-Q14 0.3815258 0.4145611 0.4465387 3.168647e-104
## Q10-Q15 0.4176732 0.4495470 0.4803157 2.157366e-124
## Q11-Q12 0.5197817 0.5478491 0.5747350 1.870488e-195
## Q11-Q13 0.3651436 0.3986730 0.4311694 7.920246e-96
## Q11-Q14 0.5342028 0.5616704 0.5879532 1.851647e-207
## Q11-Q15 0.5111332 0.5395529 0.5667941 1.629274e-188
## Q12-Q13 0.4545949 0.4851817 0.5146255 1.648258e-147
## Q12-Q14 0.5354702 0.5628844 0.5891136 1.534183e-208
## Q12-Q15 0.3766079 0.4097936 0.4419289 1.171847e-101
## Q13-Q14 0.4667464 0.4968874 0.5258754 1.013162e-155
## Q13-Q15 0.2625236 0.2986885 0.3340155 1.375737e-52
## Q14-Q15 0.4302457 0.4616926 0.4920203 5.766839e-132
# Estimate coefficient alpha
# NOTE(review): ggplot2 (attached via tidyverse) also exports an alpha();
# this call relies on psych being attached later in the search path so that
# psych::alpha is resolved -- consider writing psych::alpha(gcbs) explicitly.
alpha(gcbs)
##
## Reliability analysis
## Call: alpha(x = gcbs)
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.93 0.93 0.94 0.48 14 0.002 2.9 1 0.47
##
## lower alpha upper 95% confidence boundaries
## 0.93 0.93 0.94
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## Q1 0.93 0.93 0.94 0.48 13 0.0021 0.0105 0.46
## Q2 0.93 0.93 0.94 0.48 13 0.0021 0.0099 0.47
## Q3 0.93 0.93 0.94 0.49 13 0.0020 0.0084 0.48
## Q4 0.93 0.93 0.94 0.47 13 0.0022 0.0105 0.46
## Q5 0.93 0.93 0.94 0.48 13 0.0021 0.0112 0.48
## Q6 0.93 0.93 0.94 0.48 13 0.0021 0.0104 0.46
## Q7 0.93 0.93 0.94 0.48 13 0.0021 0.0098 0.47
## Q8 0.93 0.93 0.94 0.48 13 0.0020 0.0086 0.49
## Q9 0.93 0.93 0.94 0.48 13 0.0021 0.0108 0.46
## Q10 0.93 0.93 0.94 0.49 14 0.0020 0.0102 0.49
## Q11 0.93 0.93 0.94 0.48 13 0.0021 0.0104 0.46
## Q12 0.93 0.93 0.94 0.47 13 0.0022 0.0093 0.46
## Q13 0.93 0.93 0.94 0.48 13 0.0021 0.0092 0.48
## Q14 0.93 0.93 0.94 0.48 13 0.0021 0.0109 0.46
## Q15 0.93 0.93 0.94 0.49 14 0.0020 0.0095 0.49
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## Q1 2495 0.73 0.73 0.70 0.68 3.5 1.5
## Q2 2495 0.74 0.74 0.72 0.69 3.0 1.5
## Q3 2495 0.68 0.67 0.66 0.62 2.0 1.4
## Q4 2495 0.78 0.78 0.76 0.74 2.6 1.5
## Q5 2495 0.70 0.70 0.67 0.65 3.3 1.5
## Q6 2495 0.76 0.76 0.74 0.72 3.1 1.5
## Q7 2495 0.75 0.75 0.73 0.70 2.7 1.5
## Q8 2495 0.69 0.69 0.68 0.63 2.5 1.6
## Q9 2495 0.72 0.72 0.69 0.67 2.2 1.4
## Q10 2495 0.61 0.61 0.57 0.55 3.5 1.4
## Q11 2495 0.74 0.74 0.72 0.69 3.3 1.4
## Q12 2495 0.79 0.79 0.79 0.75 2.6 1.5
## Q13 2495 0.71 0.71 0.70 0.66 2.1 1.4
## Q14 2495 0.76 0.76 0.74 0.71 3.0 1.5
## Q15 2495 0.60 0.62 0.58 0.56 4.2 1.1
##
## Non missing response frequency for each item
## 0 1 2 3 4 5 miss
## Q1 0.00 0.16 0.12 0.12 0.27 0.32 0
## Q2 0.01 0.23 0.19 0.16 0.20 0.22 0
## Q3 0.00 0.55 0.13 0.12 0.10 0.10 0
## Q4 0.00 0.32 0.18 0.15 0.20 0.14 0
## Q5 0.00 0.19 0.14 0.13 0.28 0.26 0
## Q6 0.00 0.23 0.15 0.15 0.23 0.24 0
## Q7 0.00 0.33 0.19 0.13 0.18 0.17 0
## Q8 0.00 0.44 0.12 0.14 0.12 0.18 0
## Q9 0.00 0.45 0.19 0.12 0.12 0.11 0
## Q10 0.00 0.14 0.12 0.14 0.30 0.30 0
## Q11 0.00 0.16 0.14 0.19 0.27 0.24 0
## Q12 0.00 0.34 0.18 0.15 0.17 0.17 0
## Q13 0.01 0.51 0.15 0.15 0.10 0.09 0
## Q14 0.00 0.25 0.17 0.15 0.22 0.20 0
## Q15 0.00 0.05 0.05 0.08 0.27 0.55 0
# Calculate split-half reliability
# splitHalf() reports the range of split-half reliabilities (lambda 4 = max,
# beta = min) alongside alpha and the average interitem correlation
splitHalf(gcbs)
## Split half reliabilities
## Call: splitHalf(r = gcbs)
##
## Maximum split half reliability (lambda 4) = 0.95
## Guttman lambda 6 = 0.94
## Average split half reliability = 0.93
## Guttman lambda 3 (alpha) = 0.93
## Minimum split half reliability (beta) = 0.86
## Average interitem r = 0.48 with median = 0.47
Chapter 2 - Multidimensional EFA
Determining dimensionality:
Understanding multidimensional data:
Investigating model fit:
Example code includes:
# Load the bfi dataset from the psych package (25 personality items A1-O5
# plus gender, education, age; 2,800 observations) and inspect its structure
data(bfi, package="psych")
glimpse(bfi)
## Observations: 2,800
## Variables: 28
## $ A1 <int> 2, 2, 5, 4, 2, 6, 2, 4, 4, 2, 4, 2, 5, 5, 4, 4, 4, 5...
## $ A2 <int> 4, 4, 4, 4, 3, 6, 5, 3, 3, 5, 4, 5, 5, 5, 5, 3, 6, 5...
## $ A3 <int> 3, 5, 5, 6, 3, 5, 5, 1, 6, 6, 5, 5, 5, 5, 2, 6, 6, 5...
## $ A4 <int> 4, 2, 4, 5, 4, 6, 3, 5, 3, 6, 6, 5, 6, 6, 2, 6, 2, 4...
## $ A5 <int> 4, 5, 4, 5, 5, 5, 5, 1, 3, 5, 5, 5, 4, 6, 1, 3, 5, 5...
## $ C1 <int> 2, 5, 4, 4, 4, 6, 5, 3, 6, 6, 4, 5, 5, 4, 5, 5, 4, 5...
## $ C2 <int> 3, 4, 5, 4, 4, 6, 4, 2, 6, 5, 3, 4, 4, 4, 5, 5, 4, 5...
## $ C3 <int> 3, 4, 4, 3, 5, 6, 4, 4, 3, 6, 5, 5, 3, 4, 5, 5, 4, 5...
## $ C4 <int> 4, 3, 2, 5, 3, 1, 2, 2, 4, 2, 3, 4, 2, 2, 2, 3, 4, 4...
## $ C5 <int> 4, 4, 5, 5, 2, 3, 3, 4, 5, 1, 2, 5, 2, 1, 2, 5, 4, 3...
## $ E1 <int> 3, 1, 2, 5, 2, 2, 4, 3, 5, 2, 1, 3, 3, 2, 3, 1, 1, 2...
## $ E2 <int> 3, 1, 4, 3, 2, 1, 3, 6, 3, 2, 3, 3, 3, 2, 4, 1, 2, 2...
## $ E3 <int> 3, 6, 4, 4, 5, 6, 4, 4, NA, 4, 2, 4, 3, 4, 3, 6, 5, ...
## $ E4 <int> 4, 4, 4, 4, 4, 5, 5, 2, 4, 5, 5, 5, 2, 6, 6, 6, 5, 6...
## $ E5 <int> 4, 3, 5, 4, 5, 6, 5, 1, 3, 5, 4, 4, 4, 5, 5, 4, 5, 6...
## $ N1 <int> 3, 3, 4, 2, 2, 3, 1, 6, 5, 5, 3, 4, 1, 1, 2, 4, 4, 6...
## $ N2 <int> 4, 3, 5, 5, 3, 5, 2, 3, 5, 5, 3, 5, 2, 1, 4, 5, 4, 5...
## $ N3 <int> 2, 3, 4, 2, 4, 2, 2, 2, 2, 5, 4, 3, 2, 1, 2, 4, 4, 5...
## $ N4 <int> 2, 5, 2, 4, 4, 2, 1, 6, 3, 2, 2, 2, 2, 2, 2, 5, 4, 4...
## $ N5 <int> 3, 5, 3, 1, 3, 3, 1, 4, 3, 4, 3, NA, 2, 1, 3, 5, 5, ...
## $ O1 <int> 3, 4, 4, 3, 3, 4, 5, 3, 6, 5, 5, 4, 4, 5, 5, 6, 5, 5...
## $ O2 <int> 6, 2, 2, 3, 3, 3, 2, 2, 6, 1, 3, 6, 2, 3, 2, 6, 1, 1...
## $ O3 <int> 3, 4, 5, 4, 4, 5, 5, 4, 6, 5, 5, 4, 4, 4, 5, 6, 5, 4...
## $ O4 <int> 4, 3, 5, 3, 3, 6, 6, 5, 6, 5, 6, 5, 5, 4, 5, 3, 6, 5...
## $ O5 <int> 3, 3, 2, 5, 3, 1, 1, 3, 1, 2, 3, 4, 2, 4, 5, 2, 3, 4...
## $ gender <int> 1, 2, 2, 2, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1...
## $ education <int> NA, NA, NA, NA, NA, 3, NA, 2, 1, NA, 1, NA, NA, NA, ...
## $ age <int> 16, 18, 17, 17, 17, 21, 18, 19, 19, 17, 21, 16, 16, ...
# Establish two sets of indices to randomly split the dataset in half,
# so the EFA (exploratory) and CFA (confirmatory) analyses use independent data.
# NOTE(review): call set.seed() before sample() if a reproducible split is needed.
N <- nrow(bfi)
indices <- seq_len(N)  # safe for N == 0, unlike seq(1, N)
indices_EFA <- sample(indices, floor(0.5 * N))
# setdiff() gives the complement of the EFA indices (indices are unique)
indices_CFA <- setdiff(indices, indices_EFA)
# Use those indices to split the dataset into halves for your EFA and CFA
bfi_EFA <- bfi[indices_EFA, ]
bfi_CFA <- bfi[indices_CFA, ]
# Calculate the correlation matrix first
# (pairwise.complete.obs uses all non-missing pairs, so NAs in bfi
# do not force listwise deletion)
bfi_EFA_cor <- cor(bfi_EFA, use = "pairwise.complete.obs")
# Then use that correlation matrix to calculate eigenvalues
eigenvals <- eigen(bfi_EFA_cor)
# Look at the eigenvalues returned
# (seven eigenvalues exceed 1 in the output below; the analyses that follow
# settle on a six-factor solution)
eigenvals$values
## [1] 5.0300529 2.8855687 2.1326114 1.8423851 1.5839370 1.3565596 1.1281967
## [8] 0.8876166 0.8368945 0.7759991 0.7411295 0.7223954 0.6965653 0.6688663
## [15] 0.6495947 0.6406188 0.5569057 0.5550641 0.5408830 0.5243398 0.4913524
## [22] 0.4769243 0.4467000 0.4206217 0.4021440 0.3781987 0.3609526 0.2669220
# Then use that correlation matrix to create the scree plot
# (factors = FALSE suppresses the factor-analysis eigenvalue line,
# showing only the principal-components line)
scree(bfi_EFA_cor, factors = FALSE)
# Run the EFA with six factors (as indicated by your scree plot)
# fa() (psych) uses minres extraction here, per the printed output; the
# rotation step pulls in the GPArotation namespace
EFA_model <- fa(bfi_EFA, nfactors=6)
## Loading required namespace: GPArotation
# View results from the model object
# (prints the pattern matrix, factor correlations, and fit indices
# such as RMSR, TLI, RMSEA, and BIC)
EFA_model
## Factor Analysis using method = minres
## Call: fa(r = bfi_EFA, nfactors = 6)
## Standardized loadings (pattern matrix) based upon correlation matrix
## MR2 MR5 MR3 MR1 MR4 MR6 h2 u2 com
## A1 0.07 -0.39 0.05 -0.03 0.02 0.51 0.401 0.60 1.9
## A2 0.04 0.60 0.09 -0.04 0.05 -0.19 0.445 0.56 1.3
## A3 -0.04 0.65 0.03 -0.06 0.05 -0.01 0.484 0.52 1.0
## A4 -0.07 0.44 0.19 -0.06 -0.12 0.02 0.267 0.73 1.6
## A5 -0.17 0.59 0.01 -0.12 0.10 0.08 0.494 0.51 1.4
## C1 0.04 0.00 0.54 0.12 0.20 0.10 0.367 0.63 1.4
## C2 0.06 0.12 0.59 0.21 0.11 0.17 0.440 0.56 1.6
## C3 0.01 0.11 0.54 0.12 0.01 0.09 0.316 0.68 1.2
## C4 0.07 0.05 -0.69 0.11 0.04 0.19 0.553 0.45 1.2
## C5 0.11 0.00 -0.57 0.19 0.10 0.01 0.430 0.57 1.4
## E1 -0.13 -0.14 0.07 0.58 -0.10 0.07 0.395 0.61 1.3
## E2 0.07 -0.09 -0.03 0.69 -0.10 0.03 0.593 0.41 1.1
## E3 0.06 0.26 -0.02 -0.30 0.41 0.13 0.471 0.53 2.9
## E4 -0.04 0.38 0.01 -0.48 0.01 0.26 0.567 0.43 2.5
## E5 0.15 0.07 0.24 -0.35 0.31 0.03 0.398 0.60 3.3
## N1 0.80 -0.10 -0.01 -0.09 -0.03 0.04 0.641 0.36 1.1
## N2 0.83 -0.10 0.02 -0.08 0.01 -0.03 0.661 0.34 1.1
## N3 0.70 0.10 -0.07 0.12 0.04 0.02 0.568 0.43 1.1
## N4 0.43 0.08 -0.15 0.38 0.05 -0.04 0.458 0.54 2.3
## N5 0.53 0.22 0.01 0.21 -0.15 0.04 0.419 0.58 1.9
## O1 -0.05 -0.01 0.04 0.00 0.58 0.08 0.358 0.64 1.1
## O2 0.13 0.19 -0.10 0.01 -0.38 0.24 0.254 0.75 2.7
## O3 -0.01 0.06 0.01 -0.07 0.67 0.02 0.507 0.49 1.0
## O4 0.11 0.17 -0.03 0.39 0.35 -0.06 0.279 0.72 2.6
## O5 0.05 0.10 -0.05 -0.01 -0.46 0.28 0.293 0.71 1.8
## gender 0.24 0.23 0.14 -0.11 -0.17 -0.17 0.154 0.85 4.8
## education 0.00 -0.08 0.04 -0.01 0.09 -0.21 0.057 0.94 1.8
## age -0.03 0.04 0.10 -0.11 -0.01 -0.28 0.109 0.89 1.7
##
## MR2 MR5 MR3 MR1 MR4 MR6
## SS loadings 2.58 2.14 2.05 2.01 1.82 0.78
## Proportion Var 0.09 0.08 0.07 0.07 0.07 0.03
## Cumulative Var 0.09 0.17 0.24 0.31 0.38 0.41
## Proportion Explained 0.23 0.19 0.18 0.18 0.16 0.07
## Cumulative Proportion 0.23 0.41 0.59 0.77 0.93 1.00
##
## With factor correlations of
## MR2 MR5 MR3 MR1 MR4 MR6
## MR2 1.00 -0.03 -0.21 0.23 -0.02 0.11
## MR5 -0.03 1.00 0.18 -0.26 0.24 0.03
## MR3 -0.21 0.18 1.00 -0.17 0.22 0.00
## MR1 0.23 -0.26 -0.17 1.00 -0.18 -0.05
## MR4 -0.02 0.24 0.22 -0.18 1.00 0.03
## MR6 0.11 0.03 0.00 -0.05 0.03 1.00
##
## Mean item complexity = 1.8
## Test of the hypothesis that 6 factors are sufficient.
##
## The degrees of freedom for the null model are 378 and the objective function was 7.67 with Chi Square of 10648.54
## The degrees of freedom for the model are 225 and the objective function was 0.6
##
## The root mean square of the residuals (RMSR) is 0.02
## The df corrected root mean square of the residuals is 0.03
##
## The harmonic number of observations is 1375 with the empirical chi square 624.42 with prob < 2.9e-39
## The total number of observations was 1400 with Likelihood Chi Square = 836.09 with prob < 3.8e-71
##
## Tucker Lewis Index of factoring reliability = 0.9
## RMSEA index = 0.044 and the 90 % confidence intervals are 0.041 0.047
## BIC = -793.86
## Fit based upon off diagonal values = 0.98
## Measures of factor score adequacy
## MR2 MR5 MR3 MR1 MR4
## Correlation of (regression) scores with factors 0.93 0.89 0.88 0.89 0.86
## Multiple R square of scores with factors 0.86 0.78 0.78 0.78 0.75
## Minimum correlation of possible factor scores 0.72 0.57 0.56 0.57 0.50
## MR6
## Correlation of (regression) scores with factors 0.75
## Multiple R square of scores with factors 0.56
## Minimum correlation of possible factor scores 0.12
# Run the EFA with six factors (as indicated by your scree plot)
# NOTE(review): this refit is redundant -- EFA_model was already fit with the
# same arguments above; presumably kept so this chunk runs standalone
EFA_model <- fa(bfi_EFA, nfactors=6)
# View items' factor loadings
# (the $loadings print method blanks out small loadings for readability)
EFA_model$loadings
##
## Loadings:
## MR2 MR5 MR3 MR1 MR4 MR6
## A1 -0.391 0.510
## A2 0.600 -0.192
## A3 0.651
## A4 0.440 0.187 -0.124
## A5 -0.166 0.588 -0.121 0.100
## C1 0.542 0.115 0.198
## C2 0.119 0.591 0.209 0.112 0.170
## C3 0.105 0.539 0.124
## C4 -0.687 0.107 0.191
## C5 0.107 -0.573 0.193 0.102
## E1 -0.125 -0.139 0.576
## E2 0.692
## E3 0.260 -0.296 0.408 0.126
## E4 0.376 -0.479 0.255
## E5 0.151 0.241 -0.354 0.309
## N1 0.803 -0.101
## N2 0.829
## N3 0.695 0.124
## N4 0.434 -0.148 0.382
## N5 0.532 0.216 0.208 -0.152
## O1 0.578
## O2 0.132 0.189 -0.382 0.242
## O3 0.674
## O4 0.108 0.171 0.386 0.353
## O5 0.103 -0.463 0.281
## gender 0.240 0.232 0.143 -0.109 -0.173 -0.168
## education -0.210
## age 0.103 -0.108 -0.276
##
## MR2 MR5 MR3 MR1 MR4 MR6
## SS loadings 2.490 1.965 1.919 1.796 1.715 0.777
## Proportion Var 0.089 0.070 0.069 0.064 0.061 0.028
## Cumulative Var 0.089 0.159 0.228 0.292 0.353 0.381
# View the first few lines of examinees' factor scores
# (row 62551 below is all NA -- presumably due to missing item responses
# for that examinee; verify against the raw data)
head(EFA_model$scores)
## MR2 MR5 MR3 MR1 MR4 MR6
## 62551 NA NA NA NA NA NA
## 67093 -0.7724195 0.5611340 -0.98862424 -0.35665187 0.4818819 1.16616564
## 62162 -1.1613038 0.7091861 0.01602322 0.25442770 -0.3954765 0.61004704
## 61896 -0.1630297 0.3951015 0.20243968 -0.88470545 0.1207347 0.04158926
## 67438 0.1715825 -1.5105203 -0.13152183 0.09266676 -0.3904054 1.04839765
## 63328 1.3551168 0.5068694 1.21405963 -0.52823218 0.7409618 0.74609206
# Run each theorized EFA on your dataset: the five-factor model suggested by
# Big Five theory vs. the six-factor model suggested by the eigenvalues
bfi_theory <- fa(bfi_EFA, nfactors = 5)
bfi_eigen <- fa(bfi_EFA, nfactors = 6)
# Compare the BIC values (lower is better; the six-factor model wins here)
bfi_theory$BIC
## [1] -511.141
bfi_eigen$BIC
## [1] -793.8647
Chapter 3 - Confirmatory Factor Analysis
Setting up CFA:
Understanding the sem() syntax:
Investigating model fit:
Example code includes:
# Conduct a five-factor EFA on the EFA half of the dataset
EFA_model <- fa(bfi_EFA, nfactors = 5)
# Use the wrapper function to create syntax for use with the sem() function
# (structure.sem translates the EFA solution into sem model syntax, used
# later for the EFA-derived CFA)
EFA_syn <- structure.sem(EFA_model)
# Set up syntax specifying which items load onto each factor:
# one Big Five factor per line (factor name, colon, comma-separated items),
# to be parsed by sem's cfa() wrapper below
theory_syn_eq <- "
AGE: A1, A2, A3, A4, A5
CON: C1, C2, C3, C4, C5
EXT: E1, E2, E3, E4, E5
NEU: N1, N2, N3, N4, N5
OPE: O1, O2, O3, O4, O5
"
library(sem)
##
## Attaching package: 'sem'
## The following objects are masked from 'package:lavaan':
##
## cfa, sem
# Feed the syntax in to have variances and covariances automatically added
# (cfa() here is sem::cfa, which masks lavaan's; see ?sem::cfa for what
# reference.indicators = FALSE implies about factor scaling)
theory_syn <- cfa(text = theory_syn_eq, reference.indicators = FALSE)
## NOTE: adding 25 variances to the model
# Use the sem() function to run a CFA
# (sem::sem, masking lavaan's; the warnings below note that incomplete rows
# are dropped and that gender/education/age are present in the data but
# not in the model)
theory_CFA <- sem(theory_syn, data = bfi_CFA)
## Warning in sem.semmod(theory_syn, data = bfi_CFA): -289 observations
## removed due to missingness
## Warning in sem.semmod(theory_syn, data = bfi_CFA): The following observed variables are in the input covariance or raw-moment matrix but do not appear in the model:
## gender, education, age
# Use the summary function to view fit information and parameter estimates
# (chi-square, AIC/BIC, R-squares, loadings lam[...], factor covariances
# C[...], and residual variances V[...])
summary(theory_CFA)
##
## Model Chisquare = 2005.016 Df = 265 Pr(>Chisq) = 2.150816e-264
## AIC = 2125.016
## BIC = 146.5663
##
## Normalized Residuals
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -6.8135 -0.5790 0.7005 0.9165 2.3369 9.1189
##
## R-square for Endogenous Variables
## A1 A2 A3 A4 A5 C1 C2 C3 C4 C5
## 0.1177 0.4221 0.5726 0.2652 0.4584 0.3015 0.3895 0.2854 0.4763 0.3806
## E1 E2 E3 E4 E5 N1 N2 N3 N4 N5
## 0.3100 0.4955 0.4090 0.5185 0.3099 0.6809 0.6209 0.5142 0.3434 0.2209
## O1 O2 O3 O4 O5
## 0.3230 0.2294 0.4473 0.0615 0.2625
##
## Parameter Estimates
## Estimate Std Error z value Pr(>|z|)
## lam[A1:AGE] -0.47310342 0.04469539 -10.585060 3.495553e-26 A1 <--- AGE
## lam[A2:AGE] 0.74682312 0.03424190 21.810210 1.856211e-105 A2 <--- AGE
## lam[A3:AGE] 0.97246742 0.03690975 26.347171 5.530212e-153 A3 <--- AGE
## lam[A4:AGE] 0.73031369 0.04413591 16.546928 1.684830e-61 A4 <--- AGE
## lam[A5:AGE] 0.84567717 0.03685897 22.943592 1.707357e-116 A5 <--- AGE
## lam[C1:CON] -0.67102860 0.03893437 -17.234866 1.453902e-66 C1 <--- CON
## lam[C2:CON] -0.81745260 0.04090714 -19.983130 7.722751e-89 C2 <--- CON
## lam[C3:CON] -0.67484497 0.04040327 -16.702731 1.252010e-62 C3 <--- CON
## lam[C4:CON] 0.94244534 0.04196174 22.459632 1.030377e-111 C4 <--- CON
## lam[C5:CON] 1.00338850 0.05089376 19.715355 1.591951e-86 C5 <--- CON
## lam[E1:EXT] 0.88897886 0.04856677 18.304263 7.652070e-75 E1 <--- EXT
## lam[E2:EXT] 1.12282478 0.04596055 24.430185 8.174273e-132 E2 <--- EXT
## lam[E3:EXT] -0.86265804 0.03985228 -21.646390 6.573210e-104 E3 <--- EXT
## lam[E4:EXT] -1.04778146 0.04165946 -25.151108 1.374315e-139 E4 <--- EXT
## lam[E5:EXT] -0.73878674 0.04037337 -18.298864 8.449171e-75 E5 <--- EXT
## lam[N1:NEU] 1.28350486 0.04147064 30.949724 2.562431e-210 N1 <--- NEU
## lam[N2:NEU] 1.20126201 0.04131090 29.078570 6.700813e-186 N2 <--- NEU
## lam[N3:NEU] 1.15489837 0.04498842 25.671017 2.463597e-145 N3 <--- NEU
## lam[N4:NEU] 0.92407395 0.04646996 19.885404 5.444165e-88 N4 <--- NEU
## lam[N5:NEU] 0.76200858 0.04958272 15.368429 2.665543e-53 N5 <--- NEU
## lam[O1:OPE] 0.62901029 0.03848638 16.343713 4.822431e-60 O1 <--- OPE
## lam[O2:OPE] -0.75677801 0.05522087 -13.704565 9.533735e-43 O2 <--- OPE
## lam[O3:OPE] 0.79299695 0.04156934 19.076487 3.959980e-81 O3 <--- OPE
## lam[O4:OPE] 0.30437169 0.04406913 6.906687 4.961028e-12 O4 <--- OPE
## lam[O5:OPE] -0.68830815 0.04680655 -14.705379 5.953631e-49 O5 <--- OPE
## C[AGE,CON] -0.33903475 0.03616712 -9.374116 6.975805e-21 CON <--> AGE
## C[AGE,EXT] -0.71675158 0.02419566 -29.623150 7.522744e-193 EXT <--> AGE
## C[AGE,NEU] -0.23767064 0.03540025 -6.713812 1.896045e-11 NEU <--> AGE
## C[AGE,OPE] 0.23285485 0.04030234 5.777700 7.572854e-09 OPE <--> AGE
## C[CON,EXT] 0.38058118 0.03501044 10.870507 1.593130e-27 EXT <--> CON
## C[CON,NEU] 0.25969556 0.03567111 7.280277 3.331352e-13 NEU <--> CON
## C[CON,OPE] -0.28622586 0.04021418 -7.117536 1.098734e-12 OPE <--> CON
## C[EXT,NEU] 0.27605059 0.03454400 7.991276 1.335494e-15 NEU <--> EXT
## C[EXT,OPE] -0.34443896 0.03822892 -9.009905 2.062345e-19 OPE <--> EXT
## C[NEU,OPE] -0.09901255 0.03959699 -2.500507 1.240156e-02 OPE <--> NEU
## V[A1] 1.67705932 0.07357444 22.794048 5.252890e-115 A1 <--> A1
## V[A2] 0.76354863 0.03963774 19.263174 1.094593e-82 A2 <--> A2
## V[A3] 0.70576489 0.04510292 15.647875 3.435002e-55 A3 <--> A3
## V[A4] 1.47810259 0.06883999 21.471570 2.871544e-102 A4 <--> A4
## V[A5] 0.84511989 0.04553868 18.558285 6.990034e-77 A5 <--> A5
## V[C1] 1.04294084 0.05130818 20.326989 7.421893e-92 C1 <--> C1
## V[C2] 1.04727939 0.05588028 18.741486 2.271949e-78 C2 <--> C2
## V[C3] 1.14007575 0.05541605 20.573025 4.788395e-94 C3 <--> C3
## V[C4] 0.97670448 0.05854026 16.684321 1.704330e-62 C4 <--> C4
## V[C5] 1.63858678 0.08658845 18.923849 7.255663e-80 C5 <--> C5
## V[E1] 1.75863096 0.08303725 21.178820 1.497351e-99 E1 <--> E1
## V[E2] 1.28369574 0.07018375 18.290498 9.851190e-75 E2 <--> E2
## V[E3] 1.07543181 0.05412632 19.868927 7.560203e-88 E3 <--> E3
## V[E4] 1.01968201 0.05734980 17.780043 1.009021e-70 E4 <--> E4
## V[E5] 1.21546936 0.05738608 21.180563 1.442987e-99 E5 <--> E5
## V[N1] 0.77202708 0.05416344 14.253656 4.254140e-46 N1 <--> N1
## V[N2] 0.88100921 0.05399874 16.315365 7.674585e-60 N2 <--> N2
## V[N3] 1.25987596 0.06631127 18.999423 1.724296e-80 N3 <--> N3
## V[N4] 1.63271952 0.07642767 21.362938 2.956188e-101 N4 <--> N4
## V[N5] 2.04833227 0.09152194 22.380779 6.057690e-111 N5 <--> N5
## V[O1] 0.82913181 0.04602848 18.013452 1.527947e-72 O1 <--> O1
## V[O2] 1.92399378 0.09536054 20.175996 1.591273e-90 O2 <--> O2
## V[O3] 0.77711574 0.05441673 14.280824 2.881740e-46 O3 <--> O3
## V[O4] 1.41350151 0.06190555 22.833195 2.146796e-115 O4 <--> O4
## V[O5] 1.33126454 0.06832159 19.485268 1.464112e-84 O5 <--> O5
##
## Iterations = 26
# CAUTION THAT THIS WILL SET GLOBAL OPTIONS
# Set the options to include various fit indices so they will print
# NOTE(review): origFit is saved here but never restored in this chunk;
# restore with options(fit.indices = origFit) once the summaries are done
origFit <- getOption("fit.indices")
options(fit.indices = c("CFI", "GFI", "RMSEA", "BIC"))
# Use the summary function to view fit information and parameter estimates
# (same model as before, but now GFI, RMSEA, CFI, and BIC are printed)
summary(theory_CFA)
##
## Model Chisquare = 2005.016 Df = 265 Pr(>Chisq) = 2.150816e-264
## Goodness-of-fit index = 0.8594733
## RMSEA index = 0.07691165 90% CI: (NA, NA)
## Bentler CFI = 0.7863506
## BIC = 146.5663
##
## Normalized Residuals
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -6.8135 -0.5790 0.7005 0.9165 2.3369 9.1189
##
## R-square for Endogenous Variables
## A1 A2 A3 A4 A5 C1 C2 C3 C4 C5
## 0.1177 0.4221 0.5726 0.2652 0.4584 0.3015 0.3895 0.2854 0.4763 0.3806
## E1 E2 E3 E4 E5 N1 N2 N3 N4 N5
## 0.3100 0.4955 0.4090 0.5185 0.3099 0.6809 0.6209 0.5142 0.3434 0.2209
## O1 O2 O3 O4 O5
## 0.3230 0.2294 0.4473 0.0615 0.2625
##
## Parameter Estimates
## Estimate Std Error z value Pr(>|z|)
## lam[A1:AGE] -0.47310342 0.04469539 -10.585060 3.495553e-26 A1 <--- AGE
## lam[A2:AGE] 0.74682312 0.03424190 21.810210 1.856211e-105 A2 <--- AGE
## lam[A3:AGE] 0.97246742 0.03690975 26.347171 5.530212e-153 A3 <--- AGE
## lam[A4:AGE] 0.73031369 0.04413591 16.546928 1.684830e-61 A4 <--- AGE
## lam[A5:AGE] 0.84567717 0.03685897 22.943592 1.707357e-116 A5 <--- AGE
## lam[C1:CON] -0.67102860 0.03893437 -17.234866 1.453902e-66 C1 <--- CON
## lam[C2:CON] -0.81745260 0.04090714 -19.983130 7.722751e-89 C2 <--- CON
## lam[C3:CON] -0.67484497 0.04040327 -16.702731 1.252010e-62 C3 <--- CON
## lam[C4:CON] 0.94244534 0.04196174 22.459632 1.030377e-111 C4 <--- CON
## lam[C5:CON] 1.00338850 0.05089376 19.715355 1.591951e-86 C5 <--- CON
## lam[E1:EXT] 0.88897886 0.04856677 18.304263 7.652070e-75 E1 <--- EXT
## lam[E2:EXT] 1.12282478 0.04596055 24.430185 8.174273e-132 E2 <--- EXT
## lam[E3:EXT] -0.86265804 0.03985228 -21.646390 6.573210e-104 E3 <--- EXT
## lam[E4:EXT] -1.04778146 0.04165946 -25.151108 1.374315e-139 E4 <--- EXT
## lam[E5:EXT] -0.73878674 0.04037337 -18.298864 8.449171e-75 E5 <--- EXT
## lam[N1:NEU] 1.28350486 0.04147064 30.949724 2.562431e-210 N1 <--- NEU
## lam[N2:NEU] 1.20126201 0.04131090 29.078570 6.700813e-186 N2 <--- NEU
## lam[N3:NEU] 1.15489837 0.04498842 25.671017 2.463597e-145 N3 <--- NEU
## lam[N4:NEU] 0.92407395 0.04646996 19.885404 5.444165e-88 N4 <--- NEU
## lam[N5:NEU] 0.76200858 0.04958272 15.368429 2.665543e-53 N5 <--- NEU
## lam[O1:OPE] 0.62901029 0.03848638 16.343713 4.822431e-60 O1 <--- OPE
## lam[O2:OPE] -0.75677801 0.05522087 -13.704565 9.533735e-43 O2 <--- OPE
## lam[O3:OPE] 0.79299695 0.04156934 19.076487 3.959980e-81 O3 <--- OPE
## lam[O4:OPE] 0.30437169 0.04406913 6.906687 4.961028e-12 O4 <--- OPE
## lam[O5:OPE] -0.68830815 0.04680655 -14.705379 5.953631e-49 O5 <--- OPE
## C[AGE,CON] -0.33903475 0.03616712 -9.374116 6.975805e-21 CON <--> AGE
## C[AGE,EXT] -0.71675158 0.02419566 -29.623150 7.522744e-193 EXT <--> AGE
## C[AGE,NEU] -0.23767064 0.03540025 -6.713812 1.896045e-11 NEU <--> AGE
## C[AGE,OPE] 0.23285485 0.04030234 5.777700 7.572854e-09 OPE <--> AGE
## C[CON,EXT] 0.38058118 0.03501044 10.870507 1.593130e-27 EXT <--> CON
## C[CON,NEU] 0.25969556 0.03567111 7.280277 3.331352e-13 NEU <--> CON
## C[CON,OPE] -0.28622586 0.04021418 -7.117536 1.098734e-12 OPE <--> CON
## C[EXT,NEU] 0.27605059 0.03454400 7.991276 1.335494e-15 NEU <--> EXT
## C[EXT,OPE] -0.34443896 0.03822892 -9.009905 2.062345e-19 OPE <--> EXT
## C[NEU,OPE] -0.09901255 0.03959699 -2.500507 1.240156e-02 OPE <--> NEU
## V[A1] 1.67705932 0.07357444 22.794048 5.252890e-115 A1 <--> A1
## V[A2] 0.76354863 0.03963774 19.263174 1.094593e-82 A2 <--> A2
## V[A3] 0.70576489 0.04510292 15.647875 3.435002e-55 A3 <--> A3
## V[A4] 1.47810259 0.06883999 21.471570 2.871544e-102 A4 <--> A4
## V[A5] 0.84511989 0.04553868 18.558285 6.990034e-77 A5 <--> A5
## V[C1] 1.04294084 0.05130818 20.326989 7.421893e-92 C1 <--> C1
## V[C2] 1.04727939 0.05588028 18.741486 2.271949e-78 C2 <--> C2
## V[C3] 1.14007575 0.05541605 20.573025 4.788395e-94 C3 <--> C3
## V[C4] 0.97670448 0.05854026 16.684321 1.704330e-62 C4 <--> C4
## V[C5] 1.63858678 0.08658845 18.923849 7.255663e-80 C5 <--> C5
## V[E1] 1.75863096 0.08303725 21.178820 1.497351e-99 E1 <--> E1
## V[E2] 1.28369574 0.07018375 18.290498 9.851190e-75 E2 <--> E2
## V[E3] 1.07543181 0.05412632 19.868927 7.560203e-88 E3 <--> E3
## V[E4] 1.01968201 0.05734980 17.780043 1.009021e-70 E4 <--> E4
## V[E5] 1.21546936 0.05738608 21.180563 1.442987e-99 E5 <--> E5
## V[N1] 0.77202708 0.05416344 14.253656 4.254140e-46 N1 <--> N1
## V[N2] 0.88100921 0.05399874 16.315365 7.674585e-60 N2 <--> N2
## V[N3] 1.25987596 0.06631127 18.999423 1.724296e-80 N3 <--> N3
## V[N4] 1.63271952 0.07642767 21.362938 2.956188e-101 N4 <--> N4
## V[N5] 2.04833227 0.09152194 22.380779 6.057690e-111 N5 <--> N5
## V[O1] 0.82913181 0.04602848 18.013452 1.527947e-72 O1 <--> O1
## V[O2] 1.92399378 0.09536054 20.175996 1.591273e-90 O2 <--> O2
## V[O3] 0.77711574 0.05441673 14.280824 2.881740e-46 O3 <--> O3
## V[O4] 1.41350151 0.06190555 22.833195 2.146796e-115 O4 <--> O4
## V[O5] 1.33126454 0.06832159 19.485268 1.464112e-84 O5 <--> O5
##
## Iterations = 26
# Run a CFA using the EFA syntax you created earlier
EFA_CFA <- sem(EFA_syn, data = bfi_CFA)
## Warning in sem.semmod(EFA_syn, data = bfi_CFA): -289 observations removed
## due to missingness
# Locate the BIC in the fit statistics of the summary output
summary(EFA_CFA)$BIC
## [1] 480.1274
# Compare EFA_CFA BIC to the BIC from the CFA based on theory
summary(theory_CFA)$BIC
## [1] 146.5663
# Reset to baseline
options(fit.indices = origFit)
Chapter 4 - Refining Your Measure and Model
EFA vs CFA Revisited:
Adding Loadings to Improve Fit:
Improving Fit by Removing Loadings:
Wrap-Up:
Example code includes:
# CAUTION THAT THIS WILL SET GLOBAL OPTIONS
# Set the options to include various fit indices so they will print
origFit <- getOption("fit.indices")
options(fit.indices = c("CFI", "GFI", "RMSEA", "BIC"))
# View the first five rows of the EFA loadings
EFA_model$loadings[1:5, ]
## MR2 MR3 MR1 MR5 MR4
## A1 0.194806449 0.08286748 -0.150193662 -0.4264281 -0.01822345
## A2 -0.007392792 0.06744147 -0.009306157 0.6261111 0.04454954
## A3 -0.028215628 0.02767518 -0.105034038 0.6441602 0.02841403
## A4 -0.049129621 0.18695432 -0.091956712 0.4276816 -0.14686861
## A5 -0.123184595 0.01709411 -0.191147915 0.5553174 0.06932404
# View the first five loadings from the CFA estimated from the EFA results
summary(EFA_CFA)$coeff[1:5, ]
## Estimate Std Error z value Pr(>|z|)
## F4A1 -0.5184257 0.04528396 -11.44833 2.397187e-30 A1 <--- MR5
## F4A2 0.7768131 0.03524249 22.04195 1.141266e-107 A2 <--- MR5
## F4A3 0.9968771 0.03885030 25.65944 3.317365e-145 A3 <--- MR5
## F4A4 0.7235088 0.04521136 16.00281 1.221362e-57 A4 <--- MR5
## F4A5 0.7768296 0.03870116 20.07251 1.283512e-89 A5 <--- MR5
# Extracting factor scores from the EFA model
EFA_scores <- EFA_model$scores
# Calculating factor scores by applying the CFA parameters to the EFA dataset
CFA_scores <- fscores(EFA_CFA, data = bfi_EFA)
# Comparing factor scores from the EFA and CFA results from the bfi_EFA dataset
plot(density(EFA_scores[,1], na.rm = TRUE),
xlim = c(-3, 3), ylim = c(0, 1), col = "blue")
lines(density(CFA_scores[,1], na.rm = TRUE),
xlim = c(-3, 3), ylim = c(0, 1), col = "red")
# Add some plausible item/factor loadings to the syntax
theory_syn_add <- "
AGE: A1, A2, A3, A4, A5
CON: C1, C2, C3, C4, C5
EXT: E1, E2, E3, E4, E5, N4
NEU: N1, N2, N3, N4, N5, E3
OPE: O1, O2, O3, O4, O5
"
# Convert your equations to sem-compatible syntax
theory_syn2 <- cfa(text = theory_syn_add, reference.indicators = FALSE)
## NOTE: adding 25 variances to the model
# Run a CFA with the revised syntax
theory_CFA_add <- sem(model = theory_syn2, data = bfi_CFA)
## Warning in sem.semmod(model = theory_syn2, data = bfi_CFA): -289
## observations removed due to missingness
## Warning in sem.semmod(model = theory_syn2, data = bfi_CFA): The following observed variables are in the input covariance or raw-moment matrix but do not appear in the model:
## gender, education, age
# Conduct a likelihood ratio test
anova(theory_CFA, theory_CFA_add)
## LR Test for Difference Between Models
##
## Model Df Model Chisq Df LR Chisq Pr(>Chisq)
## theory_CFA 265 2005.0
## theory_CFA_add 263 1901.8 2 103.19 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Compare the comparative fit indices - higher is better!
summary(theory_CFA)$CFI
## [1] 0.7863506
summary(theory_CFA_add)$CFI
## [1] 0.7987748
# Compare the RMSEA values - lower is better!
summary(theory_CFA)$RMSEA
## [1] 0.07691165 NA NA 0.90000000
summary(theory_CFA_add)$RMSEA
## [1] 0.07492514 NA NA 0.90000000
# Compare BIC values
summary(theory_CFA)$BIC
## [1] 146.5663
summary(theory_CFA_add)$BIC
## [1] 57.40664
# Remove the weakest factor loading from the syntax
theory_syn_del <- "
AGE: A1, A2, A3, A4, A5
CON: C1, C2, C3, C4, C5
EXT: E1, E2, E3, E4, E5
NEU: N1, N2, N3, N4, N5
OPE: O1, O2, O3, O5
"
# Convert your equations to sem-compatible syntax
theory_syn3 <- cfa(text = theory_syn_del, reference.indicators = FALSE)
## NOTE: adding 24 variances to the model
# Run a CFA with the revised syntax
theory_CFA_del <- sem(model = theory_syn3, data = bfi_CFA)
## Warning in sem.semmod(model = theory_syn3, data = bfi_CFA): -289
## observations removed due to missingness
## Warning in sem.semmod(model = theory_syn3, data = bfi_CFA): The following observed variables are in the input covariance or raw-moment matrix but do not appear in the model:
## O4, gender, education, age
# Compare the comparative fit indices - higher is better!
summary(theory_CFA)$CFI
## [1] 0.7863506
summary(theory_CFA_del)$CFI
## [1] 0.7983846
# Compare the RMSEA values - lower is better!
summary(theory_CFA)$RMSEA
## [1] 0.07691165 NA NA 0.90000000
summary(theory_CFA_del)$RMSEA
## [1] 0.07732379 NA NA 0.90000000
# Compare BIC values
summary(theory_CFA)$BIC
## [1] 146.5663
summary(theory_CFA_del)$BIC
## [1] 150.9206
# Reset to baseline
options(fit.indices = origFit)
Chapter 1 - GLM - Extension of Regression Toolbox
Limitations of linear models:
Poisson regression:
Basic lm() functions with glm():
Example code includes:
data(ChickWeight, package="datasets")
ChickWeightEnd <- ChickWeight %>%
mutate(Chick=as.factor(as.integer(Chick))) %>%
group_by(Chick) %>%
filter(Time==max(Time), !(Chick %in% c(1, 2, 3, 8, 41))) %>%
ungroup()
glimpse(ChickWeightEnd)
## Observations: 45
## Variables: 4
## $ weight <dbl> 205, 215, 202, 157, 223, 157, 305, 98, 124, 175, 205, 9...
## $ Time <dbl> 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,...
## $ Chick <fct> 15, 17, 14, 11, 18, 12, 20, 5, 7, 13, 16, 4, 19, 9, 10,...
## $ Diet <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2...
# Fit a lm()
lm(formula = weight ~ Diet, data = ChickWeightEnd)
##
## Call:
## lm(formula = weight ~ Diet, data = ChickWeightEnd)
##
## Coefficients:
## (Intercept) Diet2 Diet3 Diet4
## 177.75 36.95 92.55 60.81
# Fit a glm()
glm( formula = weight ~ Diet , data = ChickWeightEnd, family = 'gaussian')
##
## Call: glm(formula = weight ~ Diet, family = "gaussian", data = ChickWeightEnd)
##
## Coefficients:
## (Intercept) Diet2 Diet3 Diet4
## 177.75 36.95 92.55 60.81
##
## Degrees of Freedom: 44 Total (i.e. Null); 41 Residual
## Null Deviance: 225000
## Residual Deviance: 167800 AIC: 507.8
dat <- data.frame(time=1:30,
count=c(0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 2, 0, 1, 0, 0, 1, 0, 0, 0, 2, 2, 1, 1, 4, 1, 1, 1, 1, 0, 0)
)
dat
## time count
## 1 1 0
## 2 2 0
## 3 3 0
## 4 4 0
## 5 5 1
## 6 6 0
## 7 7 0
## 8 8 1
## 9 9 0
## 10 10 0
## 11 11 2
## 12 12 0
## 13 13 1
## 14 14 0
## 15 15 0
## 16 16 1
## 17 17 0
## 18 18 0
## 19 19 0
## 20 20 2
## 21 21 2
## 22 22 1
## 23 23 1
## 24 24 4
## 25 25 1
## 26 26 1
## 27 27 1
## 28 28 1
## 29 29 0
## 30 30 0
# fit y predicted by x with data.frame dat using the poisson family
poissonOut <- glm(count ~ time, data=dat, family="poisson")
# print the output
print(poissonOut)
##
## Call: glm(formula = count ~ time, family = "poisson", data = dat)
##
## Coefficients:
## (Intercept) time
## -1.43036 0.05815
##
## Degrees of Freedom: 29 Total (i.e. Null); 28 Residual
## Null Deviance: 35.63
## Residual Deviance: 30.92 AIC: 66.02
# Fit a glm with count predicted by time using data.frame dat and gaussian family
lmOut <- glm(count ~ time, data=dat, family="gaussian")
summary(lmOut)
##
## Call:
## glm(formula = count ~ time, family = "gaussian", data = dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2022 -0.5190 -0.1497 0.2595 3.0194
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.09425 0.32891 0.287 0.7766
## time 0.03693 0.01853 1.993 0.0561 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.7714815)
##
## Null deviance: 24.667 on 29 degrees of freedom
## Residual deviance: 21.601 on 28 degrees of freedom
## AIC: 81.283
##
## Number of Fisher Scoring iterations: 2
summary(poissonOut)
##
## Call:
## glm(formula = count ~ time, family = "poisson", data = dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6547 -0.9666 -0.7226 0.3830 2.3022
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.43036 0.59004 -2.424 0.0153 *
## time 0.05815 0.02779 2.093 0.0364 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 35.627 on 29 degrees of freedom
## Residual deviance: 30.918 on 28 degrees of freedom
## AIC: 66.024
##
## Number of Fisher Scoring iterations: 5
scores <- data.frame(player=rep(c("Sam", "Lou"), each=5),
goal=c(1, 2, 0, 4, 3, 0, 0, 1, 0, 0)
)
scores
## player goal
## 1 Sam 1
## 2 Sam 2
## 3 Sam 0
## 4 Sam 4
## 5 Sam 3
## 6 Lou 0
## 7 Lou 0
## 8 Lou 1
## 9 Lou 0
## 10 Lou 0
# Fit a glm() that estimates the difference between players
summary(glm(goal ~ player, data=scores, family="poisson"))
##
## Call:
## glm(formula = goal ~ player, family = "poisson", data = scores)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0000 -0.6325 -0.6325 0.4934 1.2724
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.6094 0.9999 -1.610 0.1075
## playerSam 2.3026 1.0487 2.196 0.0281 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 18.3578 on 9 degrees of freedom
## Residual deviance: 9.8105 on 8 degrees of freedom
## AIC: 26.682
##
## Number of Fisher Scoring iterations: 5
# Fit a glm() that estimates an intercept for each player
summary(glm(goal ~ player - 1, data=scores, family="poisson"))
##
## Call:
## glm(formula = goal ~ player - 1, family = "poisson", data = scores)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0000 -0.6325 -0.6325 0.4934 1.2724
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## playerLou -1.6094 0.9999 -1.610 0.1075
## playerSam 0.6931 0.3162 2.192 0.0284 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 18.4546 on 10 degrees of freedom
## Residual deviance: 9.8105 on 8 degrees of freedom
## AIC: 26.682
##
## Number of Fisher Scoring iterations: 5
dat2 <- data.frame(Date=as.Date("2005-01-09")+1:4368, Number=0L) %>%
mutate(Month=as.factor(lubridate::month(Date)))
eq1 <- c(1, 2, 6, 22, 42, 47, 48, 86, 96, 109, 113, 119, 190, 192, 208, 248, 264, 278, 306, 333, 334, 336, 368, 375, 392, 393, 408, 417, 424, 429, 439, 449, 455, 456, 500, 523, 536, 544, 545, 548, 550, 551, 586, 590, 597, 598, 673, 678, 700, 717, 740, 750, 755, 756, 767, 775, 793, 831, 859, 865, 866, 877, 885, 887, 895, 937, 1086, 1101, 1107, 1111, 1112, 1154, 1157, 1183, 1213, 1235, 1247, 1251, 1269, 1272, 1288, 1295, 1300, 1320, 1342, 1350, 1424, 1454, 1457, 1460, 1476, 1522, 1589, 1598, 1608, 1627, 1642, 1665, 1697, 1709, 1733, 1746, 1749, 1766, 1799, 1830, 1866, 1895, 1914, 1920, 1934, 1942, 1953, 1960, 1961, 1966, 1969, 1989, 2007, 2041, 2051, 2087, 2092, 2096, 2106, 2122, 2129, 2138, 2156, 2159, 2174, 2176, 2177, 2180, 2191, 2214, 2217, 2218, 2251, 2276, 2286, 2302, 2308, 2340, 2352, 2361, 2382, 2416, 2419, 2421, 2464, 2468, 2492, 2522, 2526, 2548, 2550, 2573, 2620, 2625, 2627, 2629, 2698, 2706, 2721, 2726, 2760, 2768, 2787, 2796, 2813, 2854, 2858, 2890, 2900, 2909, 2932, 2933, 2955, 2960, 2966, 2997, 3032, 3057, 3063, 3080, 3090, 3095, 3098, 3122, 3130, 3154, 3160, 3199, 3205, 3215, 3227, 3229, 3243, 3244, 3254, 3302, 3340, 3350, 3469, 3506, 3519, 3525, 3535, 3542, 3584, 3604, 3653, 3660, 3673, 3692, 3694, 3706, 3763, 3792, 3801, 3808, 3812, 3814, 3822, 3884, 3892, 4001, 4084, 4194, 4210, 4220, 4229, 4242, 4265, 4267, 4296, 4302, 4325, 4334, 4338, 4341, 4353, 4354, 4357, 4368)
eq2 <- c(21, 195, 308, 505, 522, 560, 913, 1202, 1353, 1439, 1473, 1484, 1614, 1717, 1808, 1940, 2110, 2391, 2407, 2535, 2716, 2748, 2949, 3313, 3421, 3671, 3967, 3991, 4281)
eq3 <- c(624, 776, 1364, 1585, 2063, 2109, 2196, 2569, 2576, 2607, 3399, 3533, 3607)
eq4 <- c(463, 1918, 2417, 3064, 3606)
eq5 <- c(13, 3826)
eq6 <- c(701, 2097)
eq7 <- c(2509, 4276)
eq9 <- c(1637)
dat2[eq1, "Number"] <- 1L
dat2[eq2, "Number"] <- 2L
dat2[eq3, "Number"] <- 3L
dat2[eq4, "Number"] <- 4L
dat2[eq5, "Number"] <- 5L
dat2[eq6, "Number"] <- 6L
dat2[eq7, "Number"] <- 7L
dat2[eq9, "Number"] <- 9L
str(dat2)
## 'data.frame': 4368 obs. of 3 variables:
## $ Date : Date, format: "2005-01-10" "2005-01-11" ...
## $ Number: int 1 1 0 0 0 1 0 0 0 0 ...
## $ Month : Factor w/ 12 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
table(dat2$Number)
##
## 0 1 2 3 4 5 6 7 9
## 4068 246 29 13 5 2 2 2 1
table(dat2$Month)
##
## 1 2 3 4 5 6 7 8 9 10 11 12
## 363 339 372 360 372 360 372 372 360 372 360 366
# build your models
lmOut <- lm(Number ~ Month, data=dat2)
poissonOut <- glm(Number ~ Month, data=dat2, family="poisson")
# examine the outputs using print
print(lmOut)
##
## Call:
## lm(formula = Number ~ Month, data = dat2)
##
## Coefficients:
## (Intercept) Month2 Month3 Month4 Month5
## 0.129477 -0.038031 -0.078401 -0.057254 -0.032702
## Month6 Month7 Month8 Month9 Month10
## -0.043365 -0.005821 -0.051520 -0.023921 -0.054208
## Month11 Month12
## -0.023921 -0.022919
print(poissonOut)
##
## Call: glm(formula = Number ~ Month, family = "poisson", data = dat2)
##
## Coefficients:
## (Intercept) Month2 Month3 Month4 Month5
## -2.0443 -0.3478 -0.9302 -0.5838 -0.2911
## Month6 Month7 Month8 Month9 Month10
## -0.4079 -0.0460 -0.5073 -0.2043 -0.5424
## Month11 Month12
## -0.2043 -0.1948
##
## Degrees of Freedom: 4367 Total (i.e. Null); 4356 Residual
## Null Deviance: 2325
## Residual Deviance: 2303 AIC: 2976
# examine the outputs using summary
summary(lmOut)
##
## Call:
## lm(formula = Number ~ Month, data = dat2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.1295 -0.1056 -0.0914 -0.0753 8.8763
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.129477 0.022770 5.686 1.38e-08 ***
## Month2 -0.038031 0.032767 -1.161 0.2458
## Month3 -0.078401 0.032007 -2.450 0.0143 *
## Month4 -0.057254 0.032269 -1.774 0.0761 .
## Month5 -0.032702 0.032007 -1.022 0.3070
## Month6 -0.043365 0.032269 -1.344 0.1791
## Month7 -0.005821 0.032007 -0.182 0.8557
## Month8 -0.051520 0.032007 -1.610 0.1075
## Month9 -0.023921 0.032269 -0.741 0.4586
## Month10 -0.054208 0.032007 -1.694 0.0904 .
## Month11 -0.023921 0.032269 -0.741 0.4586
## Month12 -0.022919 0.032136 -0.713 0.4758
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4338 on 4356 degrees of freedom
## Multiple R-squared: 0.00249, Adjusted R-squared: -2.927e-05
## F-statistic: 0.9884 on 11 and 4356 DF, p-value: 0.4542
summary(poissonOut)
##
## Call:
## glm(formula = Number ~ Month, family = "poisson", data = dat2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5089 -0.4595 -0.4277 -0.3880 7.7086
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.0443 0.1459 -14.015 < 2e-16 ***
## Month2 -0.3478 0.2314 -1.503 0.132839
## Month3 -0.9302 0.2719 -3.422 0.000623 ***
## Month4 -0.5837 0.2444 -2.388 0.016923 *
## Month5 -0.2911 0.2215 -1.314 0.188706
## Month6 -0.4079 0.2314 -1.763 0.077939 .
## Month7 -0.0460 0.2074 -0.222 0.824486
## Month8 -0.5073 0.2361 -2.149 0.031671 *
## Month9 -0.2043 0.2182 -0.936 0.349112
## Month10 -0.5424 0.2387 -2.272 0.023075 *
## Month11 -0.2043 0.2182 -0.936 0.349112
## Month12 -0.1948 0.2166 -0.899 0.368434
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 2325.3 on 4367 degrees of freedom
## Residual deviance: 2302.7 on 4356 degrees of freedom
## AIC: 2975.6
##
## Number of Fisher Scoring iterations: 6
# examine the outputs using tidy
broom::tidy(lmOut)
## # A tibble: 12 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.129 0.0228 5.69 0.0000000138
## 2 Month2 -0.0380 0.0328 -1.16 0.246
## 3 Month3 -0.0784 0.0320 -2.45 0.0143
## 4 Month4 -0.0573 0.0323 -1.77 0.0761
## 5 Month5 -0.0327 0.0320 -1.02 0.307
## 6 Month6 -0.0434 0.0323 -1.34 0.179
## 7 Month7 -0.00582 0.0320 -0.182 0.856
## 8 Month8 -0.0515 0.0320 -1.61 0.108
## 9 Month9 -0.0239 0.0323 -0.741 0.459
## 10 Month10 -0.0542 0.0320 -1.69 0.0904
## 11 Month11 -0.0239 0.0323 -0.741 0.459
## 12 Month12 -0.0229 0.0321 -0.713 0.476
broom::tidy(poissonOut)
## # A tibble: 12 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -2.04 0.146 -14.0 1.27e-44
## 2 Month2 -0.348 0.231 -1.50 1.33e- 1
## 3 Month3 -0.930 0.272 -3.42 6.23e- 4
## 4 Month4 -0.584 0.244 -2.39 1.69e- 2
## 5 Month5 -0.291 0.221 -1.31 1.89e- 1
## 6 Month6 -0.408 0.231 -1.76 7.79e- 2
## 7 Month7 -0.0460 0.207 -0.222 8.24e- 1
## 8 Month8 -0.507 0.236 -2.15 3.17e- 2
## 9 Month9 -0.204 0.218 -0.936 3.49e- 1
## 10 Month10 -0.542 0.239 -2.27 2.31e- 2
## 11 Month11 -0.204 0.218 -0.936 3.49e- 1
## 12 Month12 -0.195 0.217 -0.899 3.68e- 1
# Extract the regression coefficients
coef(poissonOut)
## (Intercept) Month2 Month3 Month4 Month5 Month6
## -2.04425523 -0.34775767 -0.93019964 -0.58375226 -0.29111968 -0.40786159
## Month7 Month8 Month9 Month10 Month11 Month12
## -0.04599723 -0.50734279 -0.20426264 -0.54243411 -0.20426264 -0.19481645
# Extract the confidence intervals
confint(poissonOut)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -2.3444432 -1.77136313
## Month2 -0.8103027 0.10063404
## Month3 -1.4866061 -0.41424128
## Month4 -1.0762364 -0.11342457
## Month5 -0.7311289 0.14051326
## Month6 -0.8704066 0.04053012
## Month7 -0.4542037 0.36161360
## Month8 -0.9807831 -0.05092540
## Month9 -0.6367321 0.22171492
## Month10 -1.0218277 -0.08165226
## Month11 -0.6367321 0.22171492
## Month12 -0.6237730 0.22851779
# use the model to predict with new data
newDat <- data.frame(Month=as.factor(6:8))
predOut <- predict(object = poissonOut, newdata = newDat, type = "response")
# print the predictions
print(predOut)
## 1 2 3
## 0.08611111 0.12365591 0.07795699
Chapter 2 - Logistic Regression
Overview of logistic regression:
Bernoulli vs. Binomial Distribution:
Link functions - probit compared with logit:
Example code includes:
busData <- readr::read_csv("./RInputFiles/busData.csv")
## Parsed with column specification:
## cols(
## CommuteDays = col_integer(),
## MilesOneWay = col_double(),
## Bus = col_character()
## )
bus <- busData %>%
mutate(Bus=factor(Bus, levels=c("No", "Yes")))
glimpse(bus)
## Observations: 15,892
## Variables: 3
## $ CommuteDays <int> 5, 5, 5, 5, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,...
## $ MilesOneWay <dbl> 19.54675, 19.54675, 19.54675, 19.54675, 19.54675, ...
## $ Bus <fct> Yes, Yes, Yes, Yes, Yes, Yes, No, No, No, No, No, ...
# Build a glm that models Bus predicted by CommuteDays
# using data.frame bus. Remember to use a binomial family.
busOut <- glm(Bus ~ CommuteDays, data=bus, family="binomial")
# Print the busOut (be sure to use the print() function)
print(busOut)
##
## Call: glm(formula = Bus ~ CommuteDays, family = "binomial", data = bus)
##
## Coefficients:
## (Intercept) CommuteDays
## -1.4549 0.1299
##
## Degrees of Freedom: 15891 Total (i.e. Null); 15890 Residual
## Null Deviance: 19570
## Residual Deviance: 19540 AIC: 19540
# Look at the summary() of busOut
summary(busOut)
##
## Call:
## glm(formula = Bus ~ CommuteDays, family = "binomial", data = bus)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9560 -0.8595 -0.8595 1.5330 1.7668
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.45493 0.11471 -12.683 < 2e-16 ***
## CommuteDays 0.12985 0.02312 5.616 1.96e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 19568 on 15891 degrees of freedom
## Residual deviance: 19536 on 15890 degrees of freedom
## AIC: 19540
##
## Number of Fisher Scoring iterations: 4
# Look at the tidy() output of busOut
broom::tidy(busOut)
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -1.45 0.115 -12.7 7.32e-37
## 2 CommuteDays 0.130 0.0231 5.62 1.96e- 8
# Simulate 1 draw with a sample size of 100
binomialSim <- rbinom(n=1, size=100, prob=0.5)
# Simulate 100 draws with a sample size of 1
BernoulliSim <- rbinom(n=100, size=1, prob=0.5)
# Print the results from the binomial
print(binomialSim)
## [1] 47
# Sum the results from the Bernoulli
sum(BernoulliSim)
## [1] 46
dataLong <- data.frame(x=factor(rep(c("a", "b"), each=14), levels=c("a", "b")),
y=factor(c('fail', 'fail', 'fail', 'fail', 'success', 'fail', 'fail', 'fail', 'fail', 'fail', 'fail', 'fail', 'fail', 'success', 'success', 'fail', 'success', 'success', 'success', 'success', 'success', 'success', 'success', 'success', 'success', 'fail', 'success', 'fail'), levels=c("fail", "success"))
)
str(dataLong)
## 'data.frame': 28 obs. of 2 variables:
## $ x: Factor w/ 2 levels "a","b": 1 1 1 1 1 1 1 1 1 1 ...
## $ y: Factor w/ 2 levels "fail","success": 1 1 1 1 2 1 1 1 1 1 ...
# Fit a long-format logistic regression
lr_1 <- glm(y ~ x, data=dataLong, family="binomial")
print(lr_1)
##
## Call: glm(formula = y ~ x, family = "binomial", data = dataLong)
##
## Coefficients:
## (Intercept) xb
## -1.792 3.091
##
## Degrees of Freedom: 27 Total (i.e. Null); 26 Residual
## Null Deviance: 38.67
## Residual Deviance: 26.03 AIC: 30.03
dataWide <- dataLong %>%
group_by(x) %>%
summarize(fail=sum(y=="fail"), success=sum(y=="success"), Total=n(), successProportion = success/Total)
dataWide
## # A tibble: 2 x 5
## x fail success Total successProportion
## <fct> <int> <int> <int> <dbl>
## 1 a 12 2 14 0.143
## 2 b 3 11 14 0.786
# Fit a wide-form logistic regression (note: cbind() lists fail first, so this models the probability of "fail" -- hence the flipped coefficient signs relative to lr_1)
lr_2 <- glm(cbind(fail, success) ~ x, data=dataWide, family="binomial")
# Fit a weighted-form logistic regression
lr_3 <- glm(successProportion ~ x, weights=Total, data=dataWide, family="binomial")
# print your results
print(lr_2)
##
## Call: glm(formula = cbind(fail, success) ~ x, family = "binomial",
## data = dataWide)
##
## Coefficients:
## (Intercept) xb
## 1.792 -3.091
##
## Degrees of Freedom: 1 Total (i.e. Null); 0 Residual
## Null Deviance: 12.64
## Residual Deviance: -4.441e-16 AIC: 9.215
print(lr_3)
##
## Call: glm(formula = successProportion ~ x, family = "binomial", data = dataWide,
## weights = Total)
##
## Coefficients:
## (Intercept) xb
## -1.792 3.091
##
## Degrees of Freedom: 1 Total (i.e. Null); 0 Residual
## Null Deviance: 12.64
## Residual Deviance: 4.441e-15 AIC: 9.215
# Fit a GLM with a logit link and save it as busLogit
busLogit <- glm(Bus ~ CommuteDays, data = bus, family = binomial(link = "logit"))
# Fit a GLM with probit link and save it as busProbit
busProbit <- glm(Bus ~ CommuteDays, data = bus, family = binomial(link = "probit"))
# Print model summaries
summary(busLogit)
##
## Call:
## glm(formula = Bus ~ CommuteDays, family = binomial(link = "logit"),
## data = bus)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9560 -0.8595 -0.8595 1.5330 1.7668
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.45493 0.11471 -12.683 < 2e-16 ***
## CommuteDays 0.12985 0.02312 5.616 1.96e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 19568 on 15891 degrees of freedom
## Residual deviance: 19536 on 15890 degrees of freedom
## AIC: 19540
##
## Number of Fisher Scoring iterations: 4
summary(busProbit)
##
## Call:
## glm(formula = Bus ~ CommuteDays, family = binomial(link = "probit"),
## data = bus)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9545 -0.8596 -0.8596 1.5328 1.7706
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.88951 0.06833 -13.017 < 2e-16 ***
## CommuteDays 0.07810 0.01380 5.658 1.53e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 19568 on 15891 degrees of freedom
## Residual deviance: 19536 on 15890 degrees of freedom
## AIC: 19540
##
## Number of Fisher Scoring iterations: 4
# Convert from the logit scale to a probability (NOTE(review): the inverse logit is plogis(), the logistic CDF, which gives 0.5 at 0; dlogis() is the density and gives 0.25 here -- compare the correct pnorm() usage in the probit example below)
p <- dlogis(0)
# Simulate a logit
rbinom(n=10, size=1, prob=p)
## [1] 0 0 1 0 0 0 0 0 0 0
# Convert from the probit scale to a probability
p <- pnorm(0)
# Simulate a probit
rbinom(n=10, size=1, prob=p)
## [1] 0 0 0 1 0 0 1 0 1 1
Chapter 3 - Interpreting and Visualizing GLMs
Poisson Regression Coefficients:
Plotting Poisson Regression:
Understanding output from logistic regression:
ggplot2 and binomial regression:
Example code includes:
# extract the coefficients from lmOut
(lmCoef <- coef(lmOut))
## (Intercept) Month2 Month3 Month4 Month5 Month6
## 0.12947658 -0.03803116 -0.07840132 -0.05725436 -0.03270239 -0.04336547
## Month7 Month8 Month9 Month10 Month11 Month12
## -0.00582067 -0.05151959 -0.02392103 -0.05420777 -0.02392103 -0.02291921
# extract the coefficients from poissonOut
(poissonCoef <- coef(poissonOut))
## (Intercept) Month2 Month3 Month4 Month5 Month6
## -2.04425523 -0.34775767 -0.93019964 -0.58375226 -0.29111968 -0.40786159
## Month7 Month8 Month9 Month10 Month11 Month12
## -0.04599723 -0.50734279 -0.20426264 -0.54243411 -0.20426264 -0.19481645
# take the exponential using exp()
(poissonCoefExp <- exp(poissonCoef))
## (Intercept) Month2 Month3 Month4 Month5 Month6
## 0.1294766 0.7062700 0.3944749 0.5578014 0.7474262 0.6650709
## Month7 Month8 Month9 Month10 Month11 Month12
## 0.9550446 0.6020933 0.8152482 0.5813315 0.8152482 0.8229857
# This is because the Poisson coefficients are multiplicative
# Notice that 0.129 * 0.706 = 0.091 from the Poisson coefficients is the same as 0.129 - 0.038 = 0.091 from the linear model
cellData <- data.frame(dose=c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10),
cells=c(1, 0, 0, 0, 0, 2, 0, 1, 2, 0, 3, 0, 2, 2, 1, 0, 1, 2, 2, 2, 2, 3, 5, 3, 0, 3, 6, 2, 4, 4, 2, 2, 8, 4, 4, 4, 7, 2, 6, 5, 2, 5, 8, 4, 7, 4, 4, 7, 9, 3, 6, 7, 9, 5, 3, 5, 5, 3, 4, 11, 2, 7, 9, 3, 4, 2, 6, 5, 5, 6, 4, 5, 8, 10, 11, 9, 8, 8, 11, 7, 10, 12, 9, 12, 10, 12, 9, 17, 6, 9, 15, 11, 11, 10, 4, 9, 13, 8, 8, 13)
)
# Use geom_smooth to plot a continuous predictor variable
ggplot(data = cellData, aes(x = dose, y = cells)) +
geom_jitter(width = 0.05, height = 0.05) +
geom_smooth(method = 'glm', method.args = list(family = 'poisson'))
# Extract out the coefficients
coefOut <- coef(busOut)
# Convert the coefficients to odds-ratios
exp(coefOut)
## (Intercept) CommuteDays
## 0.2334164 1.1386623
# use tidy on busOut and exponentiate the results and extract the confidence interval
broom::tidy(busOut, exponentiate=TRUE, conf.int=TRUE)
## # A tibble: 2 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.233 0.115 -12.7 7.32e-37 0.186 0.292
## 2 CommuteDays 1.14 0.0231 5.62 1.96e- 8 1.09 1.19
str(bus)
## Classes 'tbl_df', 'tbl' and 'data.frame': 15892 obs. of 3 variables:
## $ CommuteDays: int 5 5 5 5 3 4 5 5 5 5 ...
## $ MilesOneWay: num 19.5 19.5 19.5 19.5 19.5 ...
## $ Bus : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 1 1 1 1 ...
bus <- bus %>%
mutate(Bus2 = as.integer(Bus)-1)
str(bus)
## Classes 'tbl_df', 'tbl' and 'data.frame': 15892 obs. of 4 variables:
## $ CommuteDays: int 5 5 5 5 3 4 5 5 5 5 ...
## $ MilesOneWay: num 19.5 19.5 19.5 19.5 19.5 ...
## $ Bus : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 1 1 1 1 ...
## $ Bus2 : num 1 1 1 1 1 1 0 0 0 0 ...
# add in the missing parts of the ggplot
ggJitter <- ggplot(data = bus, aes(x = MilesOneWay, y = Bus2)) +
geom_jitter(width = 0, height = 0.05)
# add in geom_smooth()
ggJitter + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# add in the missing parts of the ggplot
ggJitter + geom_smooth(method = "glm" , method.args = list(family="binomial"))
# add in the missing parts of the ggplot
ggJitter +
geom_smooth(method = 'glm', method.args = list(family = binomial(link="probit")),
color = 'red', se = FALSE
) +
geom_smooth(method = 'glm', method.args = list(family = binomial(link="logit")),
color = 'blue', se = FALSE
)
Chapter 4 - Multiple Regression with GLMs
Multiple logistic regression:
Formulas in R:
Assumptions of multiple logistic regression:
Wrap up:
Example code includes:
# Build a logistic regression with Bus predicted by CommuteDays and MilesOneWay
busBoth <- glm(Bus ~ CommuteDays + MilesOneWay, data=bus, family="binomial")
# Look at the summary of the output
summary(busBoth)
##
## Call:
## glm(formula = Bus ~ CommuteDays + MilesOneWay, family = "binomial",
## data = bus)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0732 -0.9035 -0.7816 1.3968 2.5066
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.707515 0.119719 -5.910 3.42e-09 ***
## CommuteDays 0.066084 0.023181 2.851 0.00436 **
## MilesOneWay -0.059571 0.003218 -18.512 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 19568 on 15891 degrees of freedom
## Residual deviance: 19137 on 15889 degrees of freedom
## AIC: 19143
##
## Number of Fisher Scoring iterations: 4
# Single-predictor models for comparison with the two-predictor model above.
# Build a logistic regression with Bus predicted by CommuteDays
busDays <- glm(Bus ~ CommuteDays, data=bus, family="binomial")
# Build a logistic regression with Bus predicted by MilesOneWay
busMiles <- glm(Bus ~ MilesOneWay, data=bus, family="binomial")
# Same two-predictor model fit twice with the formula terms in opposite
# order, to demonstrate that term order does not change the fitted model.
# Build a glm with CommuteDays first and MilesOneWay second
busOne <- glm(Bus ~ CommuteDays + MilesOneWay, data=bus, family="binomial")
# Build a glm with MilesOneWay first and CommuteDays second
busTwo <- glm(Bus ~ MilesOneWay + CommuteDays, data=bus, family="binomial")
# Print model summaries
summary(busOne)
##
## Call:
## glm(formula = Bus ~ CommuteDays + MilesOneWay, family = "binomial",
## data = bus)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0732 -0.9035 -0.7816 1.3968 2.5066
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.707515 0.119719 -5.910 3.42e-09 ***
## CommuteDays 0.066084 0.023181 2.851 0.00436 **
## MilesOneWay -0.059571 0.003218 -18.512 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 19568 on 15891 degrees of freedom
## Residual deviance: 19137 on 15889 degrees of freedom
## AIC: 19143
##
## Number of Fisher Scoring iterations: 4
summary(busTwo)
##
## Call:
## glm(formula = Bus ~ MilesOneWay + CommuteDays, family = "binomial",
## data = bus)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0732 -0.9035 -0.7816 1.3968 2.5066
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.707515 0.119719 -5.910 3.42e-09 ***
## MilesOneWay -0.059571 0.003218 -18.512 < 2e-16 ***
## CommuteDays 0.066084 0.023181 2.851 0.00436 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 19568 on 15891 degrees of freedom
## Residual deviance: 19137 on 15889 degrees of freedom
## AIC: 19143
##
## Number of Fisher Scoring iterations: 4
# model.matrix() shows the design matrix R builds from a formula.
size <- c(1.1, 2.2, 3.3)
count <- c(10, 4, 2)
# use model matrix with size
model.matrix(~ size)
## (Intercept) size
## 1 1 1.1
## 2 1 2.2
## 3 1 3.3
## attr(,"assign")
## [1] 0 1
# use model matrix with count
model.matrix(~ size + count)
## (Intercept) size count
## 1 1 1.1 10
## 2 1 2.2 4
## 3 1 3.3 2
## attr(,"assign")
## [1] 0 1 2
# Character vectors are treated as factors; the first level (alphabetical:
# "blue") becomes the reference captured by the intercept.
color <- c("red", "blue", "green")
# create a matrix that includes a reference intercept
model.matrix(~ color)
## (Intercept) colorgreen colorred
## 1 1 0 1
## 2 1 0 0
## 3 1 1 0
## attr(,"assign")
## [1] 0 1 1
## attr(,"contrasts")
## attr(,"contrasts")$color
## [1] "contr.treatment"
# The `- 1` drops the global intercept, so each factor level gets its
# own indicator column (cell-means coding).
# create a matrix that includes an intercept for each group
model.matrix(~ color - 1)
## colorblue colorgreen colorred
## 1 0 0 1
## 2 1 0 0
## 3 0 1 0
## attr(,"assign")
## [1] 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$color
## [1] "contr.treatment"
# With two factors and `- 1`, only the FIRST factor gets a full set of
# indicator columns; the second is still reference-coded (see output below).
shape <- c("square", "square", "circle")
# create a matrix that includes color and shape
model.matrix(~ color + shape - 1)
## colorblue colorgreen colorred shapesquare
## 1 0 0 1 1
## 2 1 0 0 1
## 3 0 1 0 0
## attr(,"assign")
## [1] 1 1 1 2
## attr(,"contrasts")
## attr(,"contrasts")$color
## [1] "contr.treatment"
##
## attr(,"contrasts")$shape
## [1] "contr.treatment"
# Swapping the factor order swaps which factor gets full indicator coding —
# compare the columns with the previous output.
# create a matrix that includes shape and color
model.matrix(~ shape + color - 1)
## shapecircle shapesquare colorgreen colorred
## 1 0 1 0 1
## 2 0 1 0 0
## 3 1 0 1 0
## attr(,"assign")
## [1] 1 1 2 2
## attr(,"contrasts")
## attr(,"contrasts")$shape
## [1] "contr.treatment"
##
## attr(,"contrasts")$color
## [1] "contr.treatment"
# Reshape the UCBAdmissions table so each row has Admitted and Rejected
# counts; a two-column cbind() response is the aggregated-binomial form.
# NOTE(review): tidyr::spread() is superseded — pivot_wider() is the modern
# equivalent; kept here to match the output transcript.
data("UCBAdmissions", package="datasets")
UCBdata <- as.data.frame(UCBAdmissions) %>%
mutate(Gender=factor(Gender, levels=c("Female", "Male")), Dept=factor(Dept, levels=LETTERS[1:6])) %>%
tidyr::spread(Admit, Freq) %>%
arrange(Dept, Gender)
# build a binomial glm where Admitted and Rejected are predicted by Gender
glm1 <- glm(cbind(Admitted, Rejected) ~ Gender, data=UCBdata, family="binomial")
summary(glm1)
##
## Call:
## glm(formula = cbind(Admitted, Rejected) ~ Gender, family = "binomial",
## data = UCBdata)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -16.7915 -4.7613 -0.4365 5.1025 11.2022
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.83049 0.05077 -16.357 <2e-16 ***
## GenderMale 0.61035 0.06389 9.553 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 877.06 on 11 degrees of freedom
## Residual deviance: 783.61 on 10 degrees of freedom
## AIC: 856.55
##
## Number of Fisher Scoring iterations: 4
# Adding Dept as a covariate: per the output below, the Gender effect is no
# longer significant once department is controlled for (Simpson's paradox).
# build a binomial glm where Admitted and Rejected are predicted by Gender and Dept
glm2 <- glm(cbind(Admitted, Rejected) ~ Gender + Dept, data=UCBdata, family="binomial")
summary(glm2)
##
## Call:
## glm(formula = cbind(Admitted, Rejected) ~ Gender + Dept, family = "binomial",
## data = UCBdata)
##
## Deviance Residuals:
## 1 2 3 4 5 6 7 8
## 3.7189 -1.2487 0.2706 -0.0560 -0.9243 1.2533 -0.0858 0.0826
## 9 10 11 12
## -0.8509 1.2205 0.2052 -0.2076
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.68192 0.09911 6.880 5.97e-12 ***
## GenderMale -0.09987 0.08085 -1.235 0.217
## DeptB -0.04340 0.10984 -0.395 0.693
## DeptC -1.26260 0.10663 -11.841 < 2e-16 ***
## DeptD -1.29461 0.10582 -12.234 < 2e-16 ***
## DeptE -1.73931 0.12611 -13.792 < 2e-16 ***
## DeptF -3.30648 0.16998 -19.452 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 877.056 on 11 degrees of freedom
## Residual deviance: 20.204 on 5 degrees of freedom
## AIC: 103.14
##
## Number of Fisher Scoring iterations: 4
# I() protects the arithmetic inside a formula, so y ~ I(x^2) fits a
# logistic regression on the squared predictor.
# Add a non-linear equation to a geom_smooth
ggJitter +
geom_smooth(method = 'glm', method.args = list(family = 'binomial'), formula = y~I(x^2), color = 'red')
Chapter 1 - What is Bioconductor?
Introduction to the Bioconductor Project:
Role of S4 in Bioconductor:
Biology of Genomic Datasets:
Example code includes:
# Bioconductor basics: version checks and BSgenome accessors.
# NOTE(review): BiocInstaller has since been replaced by BiocManager in
# current Bioconductor releases; kept as written for the course.
# Load the BiocInstaller package
library(BiocInstaller)
# Explicit syntax to check the Bioconductor version
BiocInstaller::biocVersion()
# When BiocInstaller is loaded use biocVersion alone
biocVersion()
# Load the BSgenome package
library(BSgenome)
# Check the version of the BSgenome package
packageVersion("BSgenome")
# Investigate the a_genome object using show()
# show(a_genome)
# Investigate some other accessors
# organism(a_genome)
# provider(a_genome)
# seqinfo(a_genome)
# Load the yeast genome
library(BSgenome.Scerevisiae.UCSC.sacCer3)
# Assign data to the yeastGenome object
yeastGenome <- BSgenome.Scerevisiae.UCSC.sacCer3
# Get the head of seqnames and tail of seqlengths for yeastGenome
head(seqnames(yeastGenome))
tail(seqlengths(yeastGenome))
# `$` on a BSgenome object extracts a whole chromosome sequence.
# Select chromosome M, alias chrM
yeastGenome$chrM
# Count characters of the chrM sequence
nchar(yeastGenome$chrM)
# Assign data to the yeastGenome object
yeastGenome <- BSgenome.Scerevisiae.UCSC.sacCer3
# Get the first 30 bases of each chromosome
getSeq(yeastGenome, start=1, end=30)
Chapter 2 - Biostrings and When to Use Them
Introduction to Biostrings:
Sequence handling:
Why we are interested in patterns:
Example code includes:
# Biostrings sequence handling.
# NOTE(review): zikaVirus, rnaframesZikaSet, and ns5 are objects provided by
# the DataCamp exercise environment — not defined in this document.
# Load packages
library(Biostrings)
# Check the alphabet of the zikaVirus
alphabet(zikaVirus)
# Check the alphabetFrequency of the zikaVirus
alphabetFrequency(zikaVirus)
# Check alphabet of the zikaVirus using baseOnly = TRUE
alphabet(zikaVirus, baseOnly = TRUE)
# Unlist the set and select the first 21 letters as dna_seq, then print it
dna_seq <- DNAString(subseq(as.character(zikaVirus), end = 21))
dna_seq
# 1.1 Transcribe dna_seq as rna_seq, then print it
rna_seq <- RNAString(dna_seq)
rna_seq
# 1.2 Translate rna_seq as aa_seq, then print it
aa_seq <- translate(rna_seq)
aa_seq
# 2.1 Translate dna_seq as aa_seq_2, then print it
# (translate() accepts DNA directly, skipping the explicit RNA step)
aa_seq_2 <- translate(dna_seq)
aa_seq_2
# Create zikv with one collated sequence using `zikaVirus`
zikv <- unlist(zikaVirus)
# Check the length of zikaVirus and zikv
length(zikaVirus)
length(zikv)
# Check the width of zikaVirus
width(zikaVirus)
# Subset zikv to only the first 30 bases
subZikv <- subseq(zikv, end = 30)
subZikv
# The reverse of zikv is
reverse(zikv)
# The complement of zikv is
complement(zikv)
# The reverse complement of zikv is
reverseComplement(zikv)
# The translation of zikv is
translate(zikv)
# Find palindromes in zikv
findPalindromes(zikv)
# print the rnaframesZikaSet
rnaframesZikaSet
# translate all 6 reading frames
AAzika6F <- translate(rnaframesZikaSet)
AAzika6F
# Count the matches allowing 15 mismatches
# (v* functions work on XStringSet collections; plain versions on single sequences)
vcountPattern(pattern = ns5, subject = AAzika6F, max.mismatch = 15)
# Select the frame that contains the match
selectedSet <- AAzika6F[3]
# Convert this frame into a single sequence
selectedSeq <- unlist(selectedSet)
# Use vmatchPattern with the set
vmatchPattern(pattern = ns5, subject = selectedSet, max.mismatch = 15)
# Use matchPattern with the single sequence
matchPattern(pattern = ns5, subject = selectedSeq, max.mismatch = 15)
Chapter 3 - IRanges and GenomicRanges
IRanges and Genomic Structures:
Gene of Interest:
Manipulating collections of GRanges:
Example code includes:
# IRanges can be built from any two of start/end/width, or from a logical
# Rle where runs of TRUE become ranges.
# load package IRanges
library(IRanges)
# start vector 1 to 5 and end 100
IRnum1 <- IRanges(start=1:5, end=100)
# end 100 and width 89 and 10
IRnum2 <- IRanges(end=100, width=c(89, 10))
# logical argument start = Rle(c(F, T, T, T, F, T, T, T))
IRlog1 <- IRanges(start = Rle(c(FALSE, TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE)))
# Printing objects in a list
print(list(IRnum1 = IRnum1, IRnum2 = IRnum2, IRlog1 = IRlog1))
# Load Package Genomic Ranges
library(GenomicRanges)
# NOTE(review): myGR is created in the exercise environment, not here.
# Print the GRanges object
myGR
# Check the metadata, if any
mcols(myGR)
# Extract gene ranges from a TxDb annotation package with filters.
# load human reference genome hg38
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
# assign hg38 to hg, then print it
hg <- TxDb.Hsapiens.UCSC.hg38.knownGene
hg
# extract all the genes in chromosome X as hg_chrXg, then print it
hg_chrXg <- genes(hg, filter = list(tx_chrom = c("chrX")))
hg_chrXg
# extract all positive stranded genes in chromosome X as hg_chrXgp, then sort it
hg_chrXgp <- genes(hg, filter = list(tx_chrom = c("chrX"), tx_strand = "+"))
sort(hg_chrXgp)
# load the human transcripts DB to hg
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
hg <- TxDb.Hsapiens.UCSC.hg38.knownGene
# prefilter chromosome X
seqlevels(hg) <- c("chrX")
# get all transcripts by gene
hg_chrXt <- transcriptsBy(hg, by="gene")
# select gene `215` from the transcripts
# NOTE: a numeric index [[215]] selects the 215th element BY POSITION —
# not the gene whose ID is "215" (compare the character-index version below).
hg_chrXt[[215]]
# load the human transcripts DB to hg
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
hg <- TxDb.Hsapiens.UCSC.hg38.knownGene
# prefilter chromosome X
seqlevels(hg) <- c("chrX")
# get all transcripts by gene
hg_chrXt <- transcriptsBy(hg, by="gene")
# select gene `215` from the transcripts
# NOTE: a character index [['215']] selects BY NAME (the gene ID "215"),
# which generally differs from position 215 used in the block above.
hg_chrXt[['215']]
# subsetByOverlaps keeps the ranges of hg_chrX that overlap ABCD1.
# NOTE(review): hg_chrX and ABCD1 are defined in the exercise environment.
# Store the overlapping range in rangefound
rangefound <- subsetByOverlaps(hg_chrX, ABCD1)
# Check names of rangefound
names(rangefound)
# Check the geneOfInterest
ABCD1
# Check rangefound
rangefound
Chapter 4 - Introducing ShortRead
Sequence Files:
Unique sequence identifiers:
Sequence Quality:
Match and Filter:
Multiple Assessment:
Introduction to Bioconductor:
Example code includes:
# ShortRead: FASTQ inspection, quality assessment, filtering, and Rqc plots.
# NOTE(review): fqsample, nucByCycle, and qa come from the exercise
# environment — not created in this document.
# load ShortRead
library(ShortRead)
# print fqsample
fqsample
# class of fqsample
class(fqsample)
# class sread fqsample
class(sread(fqsample))
# id fqsample
id(fqsample)
qaSummary <- qa(fqsample, type = "fastq", lane = 1)
# load ShortRead
library(ShortRead)
# Check quality
quality(fqsample)
# Check encoding
encoding(quality(fqsample))
# Check baseQuality
qaSummary[["baseQuality"]]
# glimpse nucByCycle
glimpse(nucByCycle)
# NOTE(review): gather() is superseded — pivot_longer() is the modern
# equivalent; kept to match the course transcript.
# make an awesome plot!
nucByCycle %>%
# gather the nucleotide letters in alphabet and get a new count column
gather(key = alphabet, value = count , -cycle) %>%
ggplot(aes(x = cycle, y = count, colour = alphabet)) +
geom_line(size = 0.5 ) +
labs(y = "Frequency") +
theme_bw() +
theme(panel.grid.major.x = element_blank())
# srFilter wraps a predicate: keep reads whose first 5 bases are "ATGCA".
myStartFilter <- srFilter(function(x) substr(sread(x), 1, 5) == "ATGCA")
# Load package ShortRead
library(ShortRead)
# Check class of fqsample
class(fqsample)
# filter reads into selectedReads using myStartFilter
selectedReads <- fqsample[myStartFilter(fqsample)]
# Check class of selectedReads
class(selectedReads)
# Check detail of selectedReads
detail(selectedReads)
# Load package Rqc
library(Rqc)
# Average per cycle quality plot
rqcCycleAverageQualityPlot(qa)
# Average per cycle quality plot with white background
rqcCycleAverageQualityPlot(qa) + theme_minimal()
# Read quality plot with white background
rqcReadQualityPlot(qa) + theme_minimal()
Chapter 1 - Introduction to Generalized Additive Models
Introduction:
Basis functions and smoothing:
Multivariate GAMs:
Example code includes:
# Motorcycle crash-test data: head acceleration (accel) over time (times).
data(mcycle, package="MASS")
# Examine the mcycle data frame
head(mcycle)
## times accel
## 1 2.4 0.0
## 2 2.6 -1.3
## 3 3.2 -2.7
## 4 3.6 0.0
## 5 4.0 -2.7
## 6 6.2 -2.7
plot(mcycle)
# Fit a linear model (a straight line — visibly inadequate for these data,
# motivating the GAM that follows)
lm_mod <- lm(accel ~ times, data = mcycle)
# Visualize the model
termplot(lm_mod, partial.resid = TRUE, se = TRUE)
# Load mgcv
library(mgcv)
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:forecast':
##
## getResponse
## The following object is masked from 'package:lme4':
##
## lmList
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.8-24. For overview type 'help("mgcv-package")'.
# s(times) requests a smooth function of time instead of a straight line.
# Fit the model
gam_mod <- gam(accel ~ s(times), data = mcycle)
# Plot the results
plot(gam_mod, residuals = TRUE, pch = 1)
# Extract the model coefficients (one per basis function of the smooth)
coef(gam_mod)
## (Intercept) s(times).1 s(times).2 s(times).3 s(times).4 s(times).5
## -25.545865 -63.718008 43.475644 -110.350132 -22.181006 35.034423
## s(times).6 s(times).7 s(times).8 s(times).9
## 93.176458 -9.283018 -111.661472 17.603782
# k controls the number of basis functions: small k = stiffer fit,
# large k = more wiggle room (still penalized by smoothing).
# Fit a GAM with 3 basis functions
gam_mod_k3 <- gam(accel ~ s(times, k = 3), data = mcycle)
# Fit with 20 basis functions
gam_mod_k20 <- gam(accel ~ s(times, k = 20), data = mcycle)
# Visualize the GAMs side by side
par(mfrow = c(1, 2))
plot(gam_mod_k3, residuals = TRUE, pch = 1)
plot(gam_mod_k20, residuals = TRUE, pch = 1)
par(mfrow = c(1, 1))
# method = "REML" selects the smoothing parameter automatically;
# $sp holds the selected value.
# Extract the smoothing parameter
gam_mod <- gam(accel ~ s(times), data = mcycle, method = "REML")
gam_mod$sp
## s(times)
## 0.0007758036
# sp fixes the smoothing parameter manually: large sp = smoother (stiffer),
# small sp = wigglier fit.
# Fix the smoothing parameter at 0.1
gam_mod_s1 <- gam(accel ~ s(times), data = mcycle, sp = 0.1)
# Fix the smoothing parameter at 0.0001
gam_mod_s2 <- gam(accel ~ s(times), data = mcycle, sp = 0.0001)
# Plot both models
par(mfrow = c(2, 1))
plot(gam_mod_s1, residuals = TRUE, pch = 1)
plot(gam_mod_s2, residuals = TRUE, pch = 1)
par(mfrow = c(1, 1))
# Combine many basis functions with a small fixed smoothing parameter
# Fit the GAM
gam_mod_sk <- gam(accel ~ s(times, k=50), sp=0.0001, data=mcycle)
#Visualize the model
plot(gam_mod_sk, residuals = TRUE, pch = 1)
# Auto characteristics data from the gamair package (not ggplot2::mpg).
data(mpg, package="gamair")
# Examine the data
head(mpg)
## symbol loss make fuel aspir doors style drive eng.loc wb
## 1 3 NA alfa-romero gas std two convertible rwd front 88.6
## 2 3 NA alfa-romero gas std two convertible rwd front 88.6
## 3 1 NA alfa-romero gas std two hatchback rwd front 94.5
## 4 2 164 audi gas std four sedan fwd front 99.8
## 5 2 164 audi gas std four sedan 4wd front 99.4
## 6 2 NA audi gas std two sedan fwd front 99.8
## length width height weight eng.type cylinders eng.cc fuel.sys bore
## 1 168.8 64.1 48.8 2548 dohc four 130 mpfi 3.47
## 2 168.8 64.1 48.8 2548 dohc four 130 mpfi 3.47
## 3 171.2 65.5 52.4 2823 ohcv six 152 mpfi 2.68
## 4 176.6 66.2 54.3 2337 ohc four 109 mpfi 3.19
## 5 176.6 66.4 54.3 2824 ohc five 136 mpfi 3.19
## 6 177.3 66.3 53.1 2507 ohc five 136 mpfi 3.19
## stroke comp.ratio hp rpm city.mpg hw.mpg price
## 1 2.68 9.0 111 5000 21 27 13495
## 2 2.68 9.0 111 5000 21 27 16500
## 3 3.47 9.0 154 5000 19 26 16500
## 4 3.40 10.0 102 5500 24 30 13950
## 5 3.40 8.0 115 5500 18 22 17450
## 6 3.40 8.5 110 5500 19 25 15250
str(mpg)
## 'data.frame': 205 obs. of 26 variables:
## $ symbol : int 3 3 1 2 2 2 1 1 1 0 ...
## $ loss : int NA NA NA 164 164 NA 158 NA 158 NA ...
## $ make : Factor w/ 22 levels "alfa-romero",..: 1 1 1 2 2 2 2 2 2 2 ...
## $ fuel : Factor w/ 2 levels "diesel","gas": 2 2 2 2 2 2 2 2 2 2 ...
## $ aspir : Factor w/ 2 levels "std","turbo": 1 1 1 1 1 1 1 1 2 2 ...
## $ doors : Factor w/ 2 levels "four","two": 2 2 2 1 1 2 1 1 1 2 ...
## $ style : Factor w/ 5 levels "convertible",..: 1 1 3 4 4 4 4 5 4 3 ...
## $ drive : Factor w/ 3 levels "4wd","fwd","rwd": 3 3 3 2 1 2 2 2 2 1 ...
## $ eng.loc : Factor w/ 2 levels "front","rear": 1 1 1 1 1 1 1 1 1 1 ...
## $ wb : num 88.6 88.6 94.5 99.8 99.4 ...
## $ length : num 169 169 171 177 177 ...
## $ width : num 64.1 64.1 65.5 66.2 66.4 66.3 71.4 71.4 71.4 67.9 ...
## $ height : num 48.8 48.8 52.4 54.3 54.3 53.1 55.7 55.7 55.9 52 ...
## $ weight : int 2548 2548 2823 2337 2824 2507 2844 2954 3086 3053 ...
## $ eng.type : Factor w/ 7 levels "dohc","dohcv",..: 1 1 6 4 4 4 4 4 4 4 ...
## $ cylinders : Factor w/ 7 levels "eight","five",..: 3 3 4 3 2 2 2 2 2 2 ...
## $ eng.cc : int 130 130 152 109 136 136 136 136 131 131 ...
## $ fuel.sys : Factor w/ 8 levels "1bbl","2bbl",..: 6 6 6 6 6 6 6 6 6 6 ...
## $ bore : num 3.47 3.47 2.68 3.19 3.19 3.19 3.19 3.19 3.13 3.13 ...
## $ stroke : num 2.68 2.68 3.47 3.4 3.4 3.4 3.4 3.4 3.4 3.4 ...
## $ comp.ratio: num 9 9 9 10 8 8.5 8.5 8.5 8.3 7 ...
## $ hp : int 111 111 154 102 115 110 110 110 140 160 ...
## $ rpm : int 5000 5000 5000 5500 5500 5500 5500 5500 5500 5500 ...
## $ city.mpg : int 21 21 19 24 18 19 19 19 17 16 ...
## $ hw.mpg : int 27 27 26 30 22 25 25 25 20 22 ...
## $ price : int 13495 16500 16500 13950 17450 15250 17710 18920 23875 NA ...
# Multivariate GAM: separate smooths per continuous predictor.
# Fit the model
mod_city <- gam(city.mpg ~ s(weight) + s(length) + s(price), data = mpg, method = "REML")
# Plot the model
plot(mod_city, pages = 1)
# Categorical predictors enter as ordinary linear (parametric) terms.
# Fit the model
mod_city2 <- gam(city.mpg ~ s(weight) + s(length) + s(price) + fuel + drive + style, data = mpg, method = "REML")
# Plot the model
plot(mod_city2, all.terms = TRUE, pages = 1)
# by = drive fits a separate smooth per drive level (factor-smooth
# interaction); the bare drive term supplies the per-level intercepts.
# Fit the model
mod_city3 <- gam(city.mpg ~ s(weight, by=drive) + s(length, by=drive) + s(price, by=drive) + drive,
data = mpg, method = "REML"
)
# Plot the model
plot(mod_city3, pages = 1)
Chapter 2 - Interpreting and Visualizing GAMs
Interpreting GAM Outputs:
Visualizing GAMs:
Model checking with gam.check():
Checking concurvity:
Example code includes:
# Five-smooth model; the summary below reports edf (effective degrees of
# freedom) per smooth — edf near 1 means an essentially linear effect.
# Fit the model
mod_city4 <- gam(city.mpg ~ s(weight) + s(length) + s(price) + s(rpm) + s(width),
data = mpg, method = "REML")
# View the summary
summary(mod_city4)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## city.mpg ~ s(weight) + s(length) + s(price) + s(rpm) + s(width)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.201 0.188 134 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(weight) 5.620 6.799 17.524 < 2e-16 ***
## s(length) 2.943 3.759 0.904 0.420
## s(price) 1.000 1.000 16.647 6.68e-05 ***
## s(rpm) 7.751 8.499 16.486 < 2e-16 ***
## s(width) 1.003 1.005 0.006 0.939
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.831 Deviance explained = 84.7%
## -REML = 496.47 Scale est. = 7.0365 n = 199
# Plotting options for GAM smooths.
# Fit the model
mod <- gam(accel ~ s(times), data = mcycle, method = "REML")
# Make the plot with residuals
plot(mod, residuals=TRUE)
# Change shape of residuals
plot(mod, residuals=TRUE, pch=1, cex=1)
# Fit the model
mod <- gam(hw.mpg ~ s(weight) + s(rpm) + s(price) + comp.ratio,
data = mpg, method = "REML")
# Plot the price effect (select = 3 picks the third smooth term)
plot(mod, select=c(3))
# Plot all effects, including the parametric comp.ratio term
plot(mod, all.terms=TRUE, pages=1)
# Plot the weight effect with colored shading
plot(mod, select = 1, shade=TRUE, shade.col="hotpink")
# seWithMean includes intercept uncertainty; shift moves the smooth onto
# the response scale by adding the intercept
plot(mod, select = 1, shade=TRUE, shade.col="hotpink", seWithMean=TRUE, shift=coef(mod)[1])
# Simulated dataset (200 rows) used for the gam.check() examples below:
# response y plus four predictors x0-x3 on [0, 1].
dat <- data.frame(y=c(11.17, 2.81, 12.9, 5.68, 5.58, -1.09, 5.42, 12.13, 4.73, 6.29, 5.74, 8.32, 9.76, 4.78, 9.08, 10.5, 9.4, 9.51, 14.58, 13.84, 4.01, 3.31, 5.32, 6.6, 10.54, 13.19, 10.06, 8.6, -0.62, 4.78, 5.98, 2.75, 1.36, 8.51, 8.12, 4.18, 10.65, 5.92, -0.03, 6.48, 9.12, 6.57, 15.38, 11.76, 7.47, 12, 3.4, 3.39, 0.95, 5.49, 7.92, 8.04, 8.81, 6.65, 8.93, 0.55, 6.73, 3.38, 4.42, 8.23, 12.2, 14.45, 2.82, 5.58, 8.74, 14.14, 5.74, 4.59, 14.54, 6.65, 4.21, 8.71, 1.76, 6.22, 8.87, 10.3, 9.18, 5.05, 5.44, 4.86, 3.25, 4.59, 12.01, 6.69, 6.3, 6.85, 5.45, 15.43, -0.9, 3.43, 9.83, 1.04, 1.16, 16.7, 9.16, 8.46, 7.81, 4.97, 7.46, 1.49, 8.01, 9.48, 9.43, 3.92, 6.2, 7.63, 8.56, 11.53, 9.98, 2.49, 5.67, 3.48, 7.92, 8.62, 7.44, 6.35, 10.88, 9.74, 3.79, 15.43, 6.56, 2.5, 6.66, 9.75, 12.72, 14.64, 8.9, 10.74, 5.93, 2.53, 3.69, 15.25, 0.5, 11.8, 13.19, 6.05, -1.26, 9.09, 9.78, 7.23, 11.67, 12.54, -0.36, 9.4, 7.87, 13.46, 9.33, 2.55, 9.23, 5.95, 10.46, 3.39, 3.81, 7.25, 3.94, 10.18, 8.63, 11.51, 2.42, 9.44, 5.95, 7.75, 10.16, 16.11, 5.16, 3.13, 7.75, 9.96, 7.27, 14.62, 3.88, 10.2, 5.86, 16.18, 5.4, 1.55, 2.91, 9.16, 9.77, 2.25, 5.01, 8.79, 3.34, 7.09, 8.18, 3.34, 8.02, 8.12, 6.69, 3.22, 8.15, 5.01, 11.51, 6.62, 7.07, 0.52, 10.26, 7.99, 8.98, 9.87),
x0=c(0.9, 0.27, 0.37, 0.57, 0.91, 0.2, 0.9, 0.94, 0.66, 0.63, 0.06, 0.21, 0.18, 0.69, 0.38, 0.77, 0.5, 0.72, 0.99, 0.38, 0.78, 0.93, 0.21, 0.65, 0.13, 0.27, 0.39, 0.01, 0.38, 0.87, 0.34, 0.48, 0.6, 0.49, 0.19, 0.83, 0.67, 0.79, 0.11, 0.72, 0.41, 0.82, 0.65, 0.78, 0.55, 0.53, 0.79, 0.02, 0.48, 0.73, 0.69, 0.48, 0.86, 0.44, 0.24, 0.07, 0.1, 0.32, 0.52, 0.66, 0.41, 0.91, 0.29, 0.46, 0.33, 0.65, 0.26, 0.48, 0.77, 0.08, 0.88, 0.34, 0.84, 0.35, 0.33, 0.48, 0.89, 0.86, 0.39, 0.78, 0.96, 0.43, 0.71, 0.4, 0.33, 0.76, 0.2, 0.71, 0.12, 0.25, 0.14, 0.24, 0.06, 0.64, 0.88, 0.78, 0.8, 0.46, 0.41, 0.81, 0.6, 0.65, 0.35, 0.27, 0.99, 0.63, 0.21, 0.13, 0.48, 0.92, 0.6, 0.98, 0.73, 0.36, 0.43, 0.15, 0.01, 0.72, 0.1, 0.45, 0.64, 0.99, 0.5, 0.48, 0.17, 0.75, 0.45, 0.51, 0.21, 0.23, 0.6, 0.57, 0.08, 0.04, 0.64, 0.93, 0.6, 0.56, 0.53, 0.99, 0.51, 0.68, 0.6, 0.24, 0.26, 0.73, 0.45, 0.18, 0.75, 0.1, 0.86, 0.61, 0.56, 0.33, 0.45, 0.5, 0.18, 0.53, 0.08, 0.28, 0.21, 0.28, 0.9, 0.45, 0.78, 0.88, 0.41, 0.06, 0.34, 0.72, 0.34, 0.63, 0.84, 0.86, 0.39, 0.38, 0.9, 0.64, 0.74, 0.61, 0.9, 0.29, 0.19, 0.89, 0.5, 0.88, 0.19, 0.76, 0.72, 0.94, 0.55, 0.71, 0.39, 0.1, 0.93, 0.28, 0.59, 0.11, 0.84, 0.32),
x1=c(0.78, 0.27, 0.22, 0.52, 0.27, 0.18, 0.52, 0.56, 0.13, 0.26, 0.72, 0.96, 0.1, 0.76, 0.95, 0.82, 0.31, 0.65, 0.95, 0.95, 0.34, 0.26, 0.17, 0.32, 0.51, 0.92, 0.51, 0.31, 0.05, 0.42, 0.85, 0.35, 0.13, 0.37, 0.63, 0.39, 0.69, 0.69, 0.55, 0.43, 0.45, 0.31, 0.58, 0.91, 0.14, 0.42, 0.21, 0.43, 0.13, 0.46, 0.94, 0.76, 0.93, 0.47, 0.6, 0.48, 0.11, 0.25, 0.5, 0.37, 0.93, 0.52, 0.32, 0.28, 0.79, 0.7, 0.17, 0.06, 0.75, 0.62, 0.17, 0.06, 0.11, 0.38, 0.17, 0.3, 0.19, 0.26, 0.18, 0.48, 0.77, 0.03, 0.53, 0.88, 0.37, 0.05, 0.14, 0.32, 0.15, 0.13, 0.22, 0.23, 0.13, 0.98, 0.33, 0.51, 0.68, 0.1, 0.12, 0.05, 0.93, 0.67, 0.09, 0.49, 0.46, 0.38, 0.99, 0.18, 0.81, 0.07, 0.4, 0.14, 0.19, 0.84, 0.72, 0.27, 0.5, 0.08, 0.35, 0.97, 0.62, 0.66, 0.31, 0.41, 1, 0.86, 0.95, 0.81, 0.78, 0.27, 0.76, 0.99, 0.29, 0.4, 0.81, 0.08, 0.36, 0.44, 0.16, 0.58, 0.97, 0.99, 0.18, 0.54, 0.38, 0.68, 0.27, 0.47, 0.17, 0.37, 0.73, 0.49, 0.06, 0.78, 0.42, 0.98, 0.28, 0.85, 0.08, 0.89, 0.47, 0.11, 0.33, 0.84, 0.28, 0.59, 0.84, 0.07, 0.7, 0.7, 0.46, 0.44, 0.56, 0.93, 0.23, 0.22, 0.42, 0.33, 0.86, 0.18, 0.49, 0.43, 0.56, 0.66, 0.98, 0.23, 0.24, 0.8, 0.83, 0.11, 0.96, 0.15, 0.14, 0.93, 0.51, 0.15, 0.35, 0.66, 0.31, 0.35),
x2=c(0.15, 0.66, 0.19, 0.95, 0.9, 0.94, 0.72, 0.37, 0.78, 0.01, 0.94, 0.99, 0.36, 0.75, 0.79, 0.71, 0.48, 0.49, 0.31, 0.7, 0.82, 0.43, 0.51, 0.66, 0.14, 0.34, 0.41, 0.09, 0.93, 0.84, 0.88, 0.94, 0.07, 0.38, 0.54, 0.11, 0.8, 0.74, 0.05, 0.48, 0.92, 0.04, 0.29, 0.5, 0.61, 0.26, 0.42, 0.37, 0.94, 0.12, 0.07, 0.96, 0.44, 0.37, 0.14, 0.05, 0.66, 0.58, 0.99, 0.6, 0.06, 0.16, 0.48, 0, 0.44, 0.26, 0.94, 0.72, 0.16, 0.48, 0.69, 0.46, 0.96, 0.71, 0.4, 0.12, 0.24, 0.86, 0.44, 0.5, 0.69, 0.76, 0.16, 0.85, 0.95, 0.59, 0.5, 0.19, 0, 0.88, 0.13, 0.02, 0.94, 0.29, 0.16, 0.4, 0.46, 0.43, 0.52, 0.85, 0.06, 0.55, 0.69, 0.66, 0.66, 0.47, 0.97, 0.4, 0.85, 0.76, 0.53, 0.87, 0.47, 0.01, 0.73, 0.72, 0.19, 0.65, 0.54, 0.34, 0.64, 0.83, 0.71, 0.35, 0.13, 0.39, 0.93, 0.8, 0.76, 0.96, 0.99, 0.61, 0.03, 0.34, 0.28, 0.12, 0.04, 0.37, 0.34, 0.17, 0.62, 0.4, 0.96, 0.65, 0.33, 0.2, 0.12, 1, 0.38, 0.56, 0.73, 0.87, 0.57, 0.01, 0.91, 0.77, 0.38, 0.09, 0.05, 0.82, 0.83, 0.65, 0.13, 0.34, 0.73, 0.91, 0.7, 0.24, 0.64, 0.28, 0.96, 0.16, 0.42, 0.25, 0.09, 0.83, 0.53, 0.67, 0.41, 0.84, 0.74, 0.35, 0.95, 0.65, 0.04, 0.6, 0.42, 0.08, 0.53, 0.96, 0.71, 0.55, 0.24, 0.78, 0.65, 0.83, 0.65, 0.48, 0.5, 0.38),
x3=c(0.45, 0.81, 0.93, 0.15, 0.75, 0.98, 0.97, 0.35, 0.39, 0.95, 0.11, 0.93, 0.35, 0.53, 0.54, 0.71, 0.41, 0.15, 0.34, 0.63, 0.06, 0.85, 0.21, 0.77, 0.14, 0.32, 0.62, 0.26, 0.63, 0.49, 0.94, 0.86, 0.37, 0.31, 0.83, 0.45, 0.32, 0.1, 0.06, 0.69, 0.67, 0.9, 0.3, 0.93, 0.2, 0.79, 0.22, 0.03, 0.86, 0.69, 0.94, 0.68, 0.84, 0.36, 0.39, 0.57, 0.1, 0.19, 0.59, 0.75, 0.87, 0.37, 0.8, 0.06, 0.62, 0.36, 0.59, 0.91, 0.2, 0.37, 0.67, 0.77, 0.52, 0.83, 0.53, 0.5, 0.42, 0.36, 0.12, 0.3, 0.28, 0.79, 0.78, 0.14, 0.52, 0.6, 0.51, 0.39, 0.43, 0.01, 0.92, 0.08, 0.51, 0.82, 0.6, 0.42, 0.56, 0.79, 0.17, 0.97, 0.47, 0.93, 0.9, 0.75, 0.68, 0.65, 0.07, 0.42, 0.53, 0.94, 0.71, 0.72, 0.47, 0.12, 0.78, 0.44, 0.43, 0.03, 0.15, 0.42, 0.77, 0, 0.6, 0.91, 0.71, 0.26, 0.85, 0.33, 0.58, 0.43, 0.05, 0.73, 0.55, 0.75, 0.05, 0.71, 0.3, 0.28, 0.83, 0.09, 0.04, 0.35, 0.54, 0.61, 0.27, 0.21, 0.38, 0.47, 0.84, 0.12, 0.68, 0.5, 0.9, 0.55, 0.13, 0.44, 0.19, 0.43, 0.23, 0.96, 0.45, 0.78, 0.16, 0.87, 0.21, 0.18, 0.16, 0.57, 0.73, 0.88, 0.71, 0.48, 0.82, 0.02, 1, 0.63, 0.43, 0.03, 0.75, 0.21, 1, 0.91, 0.71, 0.73, 0.47, 0.86, 0.17, 0.62, 0.29, 0.46, 0.05, 0.18, 0.06, 0.94, 0.34, 0.52, 0.63, 0.24, 0.52, 0.81)
)
str(dat)
## 'data.frame': 200 obs. of 5 variables:
## $ y : num 11.17 2.81 12.9 5.68 5.58 ...
## $ x0: num 0.9 0.27 0.37 0.57 0.91 0.2 0.9 0.94 0.66 0.63 ...
## $ x1: num 0.78 0.27 0.22 0.52 0.27 0.18 0.52 0.56 0.13 0.26 ...
## $ x2: num 0.15 0.66 0.19 0.95 0.9 0.94 0.72 0.37 0.78 0.01 ...
## $ x3: num 0.45 0.81 0.93 0.15 0.75 0.98 0.97 0.35 0.39 0.95 ...
# gam.check() diagnoses basis size: a low k-index with significant p-value
# and edf near k' suggests k is too small.
# Fit the model
mod <- gam(y ~ s(x0, k = 5) + s(x1, k = 5) + s(x2, k = 5) + s(x3, k = 5),
data = dat, method = "REML")
# Run the check function
gam.check(mod)
##
## Method: REML Optimizer: outer newton
## full convergence after 10 iterations.
## Gradient range [-0.0001426464,0.0001241444]
## (score 461.1064 & scale 5.242973).
## Hessian positive definite, eigenvalue range [0.0001426384,97.53228].
## Model rank = 17 / 17
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(x0) 4.00 2.53 0.92 0.12
## s(x1) 4.00 2.22 1.07 0.80
## s(x2) 4.00 3.94 0.84 <2e-16 ***
## s(x3) 4.00 1.00 1.01 0.48
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Deliberately small k = 3 to show the diagnostic flagging s(x2) below.
# Fit the model
mod <- gam(y ~ s(x0, k = 3) + s(x1, k = 3) + s(x2, k = 3) + s(x3, k = 3),
data = dat, method = "REML")
# Check the diagnostics
gam.check(mod)
##
## Method: REML Optimizer: outer newton
## full convergence after 10 iterations.
## Gradient range [-0.0002159481,0.0007368124]
## (score 493.6694 & scale 7.805066).
## Hessian positive definite, eigenvalue range [0.0002170151,97.50484].
## Model rank = 9 / 9
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(x0) 2.00 1.85 0.97 0.30
## s(x1) 2.00 1.71 1.06 0.75
## s(x2) 2.00 1.97 0.57 <2e-16 ***
## s(x3) 2.00 1.00 1.09 0.85
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Increasing k only for the flagged term s(x2) clears the diagnostic.
# Refit to fix issues
mod2 <- gam(y ~ s(x0, k = 3) + s(x1, k = 3) + s(x2, k = 12) + s(x3, k = 3),
data = dat, method = "REML")
# Check the new model
gam.check(mod2)
##
## Method: REML Optimizer: outer newton
## full convergence after 9 iterations.
## Gradient range [-0.0001262011,0.0001907036]
## (score 452.0731 & scale 4.569005).
## Hessian positive definite, eigenvalue range [0.01536015,97.63581].
## Model rank = 18 / 18
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(x0) 2.00 1.93 0.91 0.10
## s(x1) 2.00 1.89 1.12 0.97
## s(x2) 11.00 8.07 0.97 0.29
## s(x3) 2.00 1.18 1.04 0.76
# Concurvity is the GAM analogue of collinearity: values near 1 mean one
# smooth can be approximated by the others.
# Fit the model
mod <- gam(hw.mpg ~ s(length) + s(width) + s(height) + s(weight),
data = mpg, method = "REML")
# Check overall concurvity (each smooth vs all others combined)
concurvity(mod, full=TRUE)
## para s(length) s(width) s(height) s(weight)
## worst 1.079374e-20 0.9303404 0.9322887 0.6723705 0.9603887
## observed 1.079374e-20 0.7534619 0.8757513 0.4869308 0.8793300
## estimate 1.079374e-20 0.8353324 0.7943374 0.4452676 0.8567519
# full = FALSE returns pairwise matrices identifying WHICH terms overlap
# Check pairwise concurvity
concurvity(mod, full=FALSE)
## $worst
## para s(length) s(width) s(height) s(weight)
## para 1.000000e+00 4.799804e-26 5.458174e-21 4.926340e-23 3.221614e-25
## s(length) 4.759962e-26 1.000000e+00 8.336513e-01 6.058015e-01 8.797217e-01
## s(width) 5.458344e-21 8.336513e-01 1.000000e+00 4.099837e-01 8.953662e-01
## s(height) 4.927251e-23 6.058015e-01 4.099837e-01 1.000000e+00 3.665831e-01
## s(weight) 3.233688e-25 8.797217e-01 8.953662e-01 3.665831e-01 1.000000e+00
##
## $observed
## para s(length) s(width) s(height) s(weight)
## para 1.000000e+00 1.128295e-29 4.467995e-32 9.887661e-34 6.730965e-31
## s(length) 4.759962e-26 1.000000e+00 7.511142e-01 2.827977e-01 8.232449e-01
## s(width) 5.458344e-21 5.077384e-01 1.000000e+00 1.186126e-01 7.813743e-01
## s(height) 4.927251e-23 2.284116e-01 3.313152e-01 1.000000e+00 2.900361e-01
## s(weight) 3.233688e-25 6.052819e-01 7.863555e-01 1.494913e-01 1.000000e+00
##
## $estimate
## para s(length) s(width) s(height) s(weight)
## para 1.000000e+00 1.564968e-28 1.740649e-23 3.448567e-25 1.481483e-27
## s(length) 4.759962e-26 1.000000e+00 6.415191e-01 2.271285e-01 7.209033e-01
## s(width) 5.458344e-21 6.477497e-01 1.000000e+00 1.054762e-01 7.241891e-01
## s(height) 4.927251e-23 3.303484e-01 2.644827e-01 1.000000e+00 2.669300e-01
## s(weight) 3.233688e-25 7.235198e-01 6.913221e-01 1.390568e-01 1.000000e+00
Chapter 3 - Spatial GAMs and Interactions
Two-dimensional smooths and spatial data:
Plotting and interpreting GAM interactions:
Visualizing categorical-continuous interactions:
Interactions with different scales: Tensors:
Example code includes:
# Meuse river soil-pollution data: heavy-metal concentrations with
# spatial coordinates (x, y).
# Inspect the data
data(meuse, package="sp")
head(meuse)
## x y cadmium copper lead zinc elev dist om ffreq soil
## 1 181072 333611 11.7 85 299 1022 7.909 0.00135803 13.6 1 1
## 2 181025 333558 8.6 81 277 1141 6.983 0.01222430 14.0 1 1
## 3 181165 333537 6.5 68 199 640 7.800 0.10302900 13.0 1 1
## 4 181298 333484 2.6 81 116 257 7.655 0.19009400 8.0 1 2
## 5 181307 333330 2.8 48 117 269 7.480 0.27709000 8.7 1 2
## 6 181390 333260 3.0 61 137 281 7.791 0.36406700 7.8 1 2
## lime landuse dist.m
## 1 1 Ah 50
## 2 1 Ah 30
## 3 1 Ah 150
## 4 0 Ga 270
## 5 0 Ah 380
## 6 0 Ga 470
str(meuse)
## 'data.frame': 155 obs. of 14 variables:
## $ x : num 181072 181025 181165 181298 181307 ...
## $ y : num 333611 333558 333537 333484 333330 ...
## $ cadmium: num 11.7 8.6 6.5 2.6 2.8 3 3.2 2.8 2.4 1.6 ...
## $ copper : num 85 81 68 81 48 61 31 29 37 24 ...
## $ lead : num 299 277 199 116 117 137 132 150 133 80 ...
## $ zinc : num 1022 1141 640 257 269 ...
## $ elev : num 7.91 6.98 7.8 7.66 7.48 ...
## $ dist : num 0.00136 0.01222 0.10303 0.19009 0.27709 ...
## $ om : num 13.6 14 13 8 8.7 7.8 9.2 9.5 10.6 6.3 ...
## $ ffreq : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ soil : Factor w/ 3 levels "1","2","3": 1 1 1 2 2 2 2 1 1 2 ...
## $ lime : Factor w/ 2 levels "0","1": 2 2 2 1 1 1 1 1 1 1 ...
## $ landuse: Factor w/ 15 levels "Aa","Ab","Ag",..: 4 4 4 11 4 11 4 2 2 15 ...
## $ dist.m : num 50 30 150 270 380 470 240 120 240 420 ...
# s(x, y) fits a single two-dimensional smooth surface over the coordinates.
# Fit the 2-D model
mod2d <- gam(cadmium ~ s(x, y), data=meuse, method="REML")
# Inspect the model
summary(mod2d)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## cadmium ~ s(x, y)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.2458 0.1774 18.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(x,y) 23.48 27.24 8.667 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.607 Deviance explained = 66.7%
## -REML = 372.07 Scale est. = 4.8757 n = 155
coef(mod2d)
## (Intercept) s(x,y).1 s(x,y).2 s(x,y).3 s(x,y).4 s(x,y).5
## 3.2458065 0.8686658 -10.2154908 6.4161781 -2.6784725 9.1807111
## s(x,y).6 s(x,y).7 s(x,y).8 s(x,y).9 s(x,y).10 s(x,y).11
## 3.7004932 -10.4780937 -8.9821840 -0.6218677 -4.6789789 -5.4267313
## s(x,y).12 s(x,y).13 s(x,y).14 s(x,y).15 s(x,y).16 s(x,y).17
## 7.4996452 8.1962843 -7.6311640 4.5829340 -0.9734724 0.7634059
## s(x,y).18 s(x,y).19 s(x,y).20 s(x,y).21 s(x,y).22 s(x,y).23
## 8.8112827 -4.8639552 -6.8085148 3.8059356 6.3499868 6.4701169
## s(x,y).24 s(x,y).25 s(x,y).26 s(x,y).27 s(x,y).28 s(x,y).29
## -8.1556061 7.2050985 0.1567317 -53.4384704 -4.2860149 5.5212533
# Models of the form s(x, y) + s(v1) + ... are a great way to model spatial
# data: they incorporate spatial relationships as well as independent predictors.
mod2da <- gam(
  cadmium ~ s(x, y) + s(elev) + s(dist),
  method = "REML",
  data = meuse
)
# Inspect the fitted model
summary(mod2da)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## cadmium ~ s(x, y) + s(elev) + s(dist)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.2458 0.1238 26.21 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(x,y) 20.398 24.599 2.324 0.00078 ***
## s(elev) 1.282 1.496 28.868 6.52e-08 ***
## s(dist) 6.609 7.698 13.677 5.25e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.809 Deviance explained = 84.4%
## -REML = 321.06 Scale est. = 2.3762 n = 155
# Visualize the smooths of mod2da; scheme selects the rendering style.
# Contour plot
plot(mod2da, pages = 1)
# 3D surface plot
plot(mod2da, scheme=1, pages = 1)
# Colored heat map
plot(mod2da, scheme=2, pages=1)
# Make the perspective plot with error surfaces (se = 2 standard errors)
vis.gam(mod2d, view = c("x", "y"), plot.type="persp", se=2)
# Rotate the same plot
vis.gam(mod2d, view = c("x", "y"), plot.type="persp", se=2, theta=135)
# too.far controls how far beyond the data the surface is drawn.
# Make plot with 5% extrapolation
vis.gam(mod2d, view = c("x", "y"), plot.type = "contour", too.far=0.05)
# Overlay data
points(meuse)
# Make plot with 10% extrapolation
vis.gam(mod2d, view = c("x", "y"), plot.type="contour", too.far=0.1)
# Overlay data
points(meuse)
# Make plot with 25% extrapolation
vis.gam(mod2d, view = c("x", "y"),
plot.type = "contour", too.far = 0.25)
# Overlay data
points(meuse)
# Fit a model with separate smooths for each land-use level:
# by = landuse yields one s(dist) term per level (see s(dist):landuse* below).
mod_sep <- gam(copper ~ s(dist, by = landuse), data = meuse, method = "REML")
# Examine the summary
summary(mod_sep)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## copper ~ s(dist, by = landuse)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.726 1.371 26.78 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(dist):landuseAa 1.371 1.605 0.493 0.43082
## s(dist):landuseAb 1.000 1.000 1.674 0.19792
## s(dist):landuseAg 1.514 1.815 0.940 0.28255
## s(dist):landuseAh 2.496 3.081 8.783 1.96e-05 ***
## s(dist):landuseAm 1.000 1.000 8.606 0.00395 **
## s(dist):landuseB 1.000 1.000 1.207 0.27401
## s(dist):landuseBw 1.000 1.000 0.007 0.93520
## s(dist):landuseDEN 1.000 1.000 0.230 0.63255
## s(dist):landuseFh 1.000 1.000 0.698 0.40494
## s(dist):landuseFw 2.754 3.377 5.289 0.00120 **
## s(dist):landuseGa 2.791 2.958 3.720 0.01092 *
## s(dist):landuseSPO 1.000 1.000 1.101 0.29599
## s(dist):landuseSTA 1.245 1.430 0.179 0.65089
## s(dist):landuseTv 1.000 1.000 0.698 0.40495
## s(dist):landuseW 4.333 5.289 37.857 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.642 Deviance explained = 69.9%
## -REML = 580.91 Scale est. = 195.07 n = 154
# Fit a model with a factor-smooth interaction over dist and landuse
# (bs = "fs" basis).
mod_fs <- gam(
  copper ~ s(dist, landuse, bs="fs"),
  method = "REML",
  data = meuse
)
# Examine the summary
summary(mod_fs)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## copper ~ s(dist, landuse, bs = "fs")
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.07 3.33 9.031 1.43e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(dist,landuse) 16.37 71 2.463 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.533 Deviance explained = 58.3%
## -REML = 659.94 Scale est. = 254.2 n = 154
# Plot both the models with plot()
plot(mod_sep, pages=1)
plot(mod_fs, pages=1)
# Plot both the models with vis.gam()
vis.gam(mod_sep, view = c("dist", "landuse"), plot.type = "persp")
vis.gam(mod_fs, view = c("dist", "landuse"), plot.type = "persp")
# Fit the model: te() builds a tensor-product smooth, used here because the
# inputs (coordinates and elevation) are on different scales.
tensor_mod <- gam(cadmium ~ te(x, y, elev), data=meuse, method="REML")
# Summarize and plot
summary(tensor_mod)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## cadmium ~ te(x, y, elev)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.2458 0.1329 24.43 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## te(x,y,elev) 38.29 45.86 11.87 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.78 Deviance explained = 83.4%
## -REML = 318.09 Scale est. = 2.7358 n = 155
plot(tensor_mod)
# Fit the model: the formula decomposes the effect into a 2-D spatial term,
# an elevation term, and a separate 3-way interaction term (ti).
tensor_mod2 <- gam(cadmium ~ ti(x, y) + te(elev) + ti(x, y, elev), data=meuse, method="REML")
# Summarize and plot
summary(tensor_mod2)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## cadmium ~ ti(x, y) + te(elev) + ti(x, y, elev)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.5102 0.4311 8.143 3.61e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## ti(x,y) 10.80 12.132 6.026 7.40e-09 ***
## te(elev) 2.79 3.099 11.317 1.14e-06 ***
## ti(x,y,elev) 17.20 22.376 2.759 0.00017 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.714 Deviance explained = 77.1%
## -REML = 349.54 Scale est. = 3.5476 n = 155
plot(tensor_mod2, pages = 1)
# Reset the graphics device to a single-panel layout
par(mfrow=c(1, 1))
Chapter 4 - Logistic GAM for Classification
Types of model outcome:
Visualizing logistic GAMs:
Making predictions:
Wrap up and next steps:
Example code includes:
# Load the credit-sale data: binary purchase outcome plus account covariates.
csale <- readRDS("./RInputFiles/csale.rds")
# Examine the csale data frame
head(csale)
## # A tibble: 6 x 8
## purchase n_acts bal_crdt_ratio avg_prem_balance retail_crdt_rat~
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 11 0 2494. 0
## 2 0 0 36.1 2494. 11.5
## 3 0 6 17.6 2494. 0
## 4 0 8 12.5 2494. 0.8
## 5 0 8 59.1 2494. 20.8
## 6 0 1 90.1 2494. 11.5
## # ... with 3 more variables: avg_fin_balance <dbl>, mortgage_age <dbl>,
## # cred_limit <dbl>
str(csale)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1779 obs. of 8 variables:
## $ purchase : num 0 0 0 0 0 0 0 0 1 0 ...
## $ n_acts : num 11 0 6 8 8 1 5 0 9 18 ...
## $ bal_crdt_ratio : num 0 36.1 17.6 12.5 59.1 ...
## $ avg_prem_balance : num 2494 2494 2494 2494 2494 ...
## $ retail_crdt_ratio: num 0 11.5 0 0.8 20.8 ...
## $ avg_fin_balance : num 1767 1767 0 1021 797 ...
## $ mortgage_age : num 182 139 139 139 93 ...
## $ cred_limit : num 12500 0 0 0 0 0 0 0 11500 16000 ...
# Logistic GAM of purchase on a single smooth of mortgage_age
log_mod <- gam(
  purchase ~ s(mortgage_age),
  family = binomial,
  method = "REML",
  data = csale
)
# Logistic GAM with a smooth term for every predictor in csale
log_mod2 <- gam(
  purchase ~ s(n_acts) + s(bal_crdt_ratio) + s(avg_prem_balance) +
    s(retail_crdt_ratio) + s(avg_fin_balance) + s(mortgage_age) +
    s(cred_limit),
  family = binomial,
  method = "REML",
  data = csale
)
# View the summary of the full model
summary(log_mod2)
##
## Family: binomial
## Link function: logit
##
## Formula:
## purchase ~ s(n_acts) + s(bal_crdt_ratio) + s(avg_prem_balance) +
## s(retail_crdt_ratio) + s(avg_fin_balance) + s(mortgage_age) +
## s(cred_limit)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.64060 0.07557 -21.71 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(n_acts) 3.474 4.310 93.670 < 2e-16 ***
## s(bal_crdt_ratio) 4.308 5.257 18.386 0.00318 **
## s(avg_prem_balance) 2.275 2.816 7.800 0.04958 *
## s(retail_crdt_ratio) 1.001 1.001 1.422 0.23343
## s(avg_fin_balance) 1.850 2.202 2.506 0.27895
## s(mortgage_age) 4.669 5.710 9.656 0.13401
## s(cred_limit) 1.001 1.002 23.066 1.58e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.184 Deviance explained = 18.4%
## -REML = 781.37 Scale est. = 1 n = 1779
# Plot on the log-odds scale
plot(log_mod2, pages=1)
# Plot on the probability scale (trans = plogis transforms each smooth)
plot(log_mod2, pages = 1, trans = plogis)
# Plot with the intercept (shift adds the model intercept before transforming)
plot(log_mod2, pages = 1, trans = plogis, shift = coef(log_mod2)[1])
# Plot with intercept uncertainty
plot(log_mod2, pages = 1, trans = plogis, shift = coef(log_mod2)[1], seWithMean = TRUE)
# Build 7 hypothetical customers. byrow=FALSE fills column by column,
# so each consecutive run of 7 values is one column of the 7x8 frame.
new_credit_data <- data.frame(matrix(data=c(1, 0, 0, 0, 0, 0, 0, 2, 19, 0, 0, 1, 6, 3, 0.3, 4.2, 36.095, 36.095, 25.7, 45.6, 10.8, 61, 967, 2494.414, 2494.414, 2494.414, 195, 2494.414, 11.491, 0, 11.491, 11.491, 11.491, 0, 11.491, 1767.197, 249, 1767.197, 1767.197, 1767.197, 0, 1767.197, 155, 65, 138.96, 138.96, 138.96, 13, 138.96, 0, 10000, 0, 0, 0, 13800, 0), ncol=8, nrow=7, byrow=FALSE))
# Column names must match the predictors log_mod2 was fitted with
names(new_credit_data) <- c('purchase', 'n_acts', 'bal_crdt_ratio', 'avg_prem_balance', 'retail_crdt_ratio', 'avg_fin_balance', 'mortgage_age', 'cred_limit')
new_credit_data
## purchase n_acts bal_crdt_ratio avg_prem_balance retail_crdt_ratio
## 1 1 2 0.300 61.000 11.491
## 2 0 19 4.200 967.000 0.000
## 3 0 0 36.095 2494.414 11.491
## 4 0 0 36.095 2494.414 11.491
## 5 0 1 25.700 2494.414 11.491
## 6 0 6 45.600 195.000 0.000
## 7 0 3 10.800 2494.414 11.491
## avg_fin_balance mortgage_age cred_limit
## 1 1767.197 155.00 0
## 2 249.000 65.00 10000
## 3 1767.197 138.96 0
## 4 1767.197 138.96 0
## 5 1767.197 138.96 0
## 6 0.000 13.00 13800
## 7 1767.197 138.96 0
# Predict on the link (log-odds) scale together with standard errors
predictions <- predict(log_mod2, newdata = new_credit_data,
type = "link", se.fit = TRUE)
# Approximate interval bounds: fit +/- 2 standard errors, still on log-odds scale
high_pred <- predictions$fit + 2*predictions$se.fit
low_pred <- predictions$fit - 2*predictions$se.fit
# Convert the bounds to the probability scale with the logistic CDF.
# plogis(x) = 1 / (1 + exp(-x)) — the same mapping as the hand-rolled
# 1 / (1/exp(x) + 1) used originally, but the standard, clearer idiom.
high_prob <- plogis(high_pred)
low_prob <- plogis(low_pred)
# Inspect the upper bounds
high_prob
## 1 2 3 4 5 6
## 0.29534339 0.80264652 0.07015439 0.07015439 0.12907886 0.37928666
## 7
## 0.27248599
low_prob
## 1 2 3 4 5 6
## 0.15023805 0.53570264 0.03758807 0.03758807 0.06804119 0.11279970
## 7
## 0.15491807
# Predict from the model. type = "terms" returns each smooth's contribution
# separately (on the link scale); drop=FALSE keeps the one-row subset a data frame.
prediction_1 <- predict(log_mod2, newdata = new_credit_data[1, ,drop=FALSE], type = "terms")
# Inspect
prediction_1
## s(n_acts) s(bal_crdt_ratio) s(avg_prem_balance) s(retail_crdt_ratio)
## 1 -0.3626621 0.3352521 0.369506 -0.007531015
## s(avg_fin_balance) s(mortgage_age) s(cred_limit)
## 1 -0.04057248 -0.1774484 0.2229033
## attr(,"constant")
## (Intercept)
## -1.640601
Chapter 1 - Foundations of Tidy Machine Learning
Introduction:
Map family of functions:
Tidy models with broom:
Example code includes:
# Explore gapminder
data(gapminder, package="gapminder")
head(gapminder)
# Prepare the nested dataframe gap_nested:
# nest() collapses each country's rows into a single list-column entry.
gap_nested <- gapminder %>%
group_by(country) %>%
nest()
# Explore gap_nested
head(gap_nested)
# Create the unnested dataframe called gap_unnested
gap_unnested <- gap_nested %>%
unnest()
# Confirm that your data was not modified by the nest/unnest round trip
identical(gapminder, gap_unnested)
# Extract the data of Algeria from the list-column
algeria_df <- gap_nested$data[[which(gap_nested$country=="Algeria")]]
# Calculate the minimum of the population vector
min(algeria_df$pop)
# Calculate the maximum of the population vector
max(algeria_df$pop)
# Calculate the mean of the population vector
mean(algeria_df$pop)
# Calculate the mean population for each country (map returns a list-column)
pop_nested <- gap_nested %>%
mutate(mean_pop = map(.x=data, .f=~mean(.x$pop)))
# Take a look at pop_nested
head(pop_nested)
# Extract the mean_pop value by using unnest
pop_mean <- pop_nested %>%
unnest(mean_pop)
# Take a look at pop_mean
head(pop_mean)
# Calculate mean population and store result as a double
# (map_dbl returns a plain numeric column, so no unnest is needed)
pop_mean <- gap_nested %>%
mutate(mean_pop = map_dbl(.x=data, ~mean(.x$pop)))
# Take a look at pop_mean
head(pop_mean)
# Build a linear model of life expectancy over time for each country
gap_models <- gap_nested %>%
mutate(model = map(.x=data, .f=~lm(formula = lifeExp ~ year, data = .x)))
# Extract the model for Algeria
algeria_model <- gap_models$model[[which(gap_models$country=="Algeria")]]
# View the summary for the Algeria model
summary(algeria_model)
# Extract the coefficients of the algeria_model as a dataframe
broom::tidy(algeria_model)
# Extract the statistics of the algeria_model as a dataframe
broom::glance(algeria_model)
# Build the augmented dataframe (observations plus fitted values/residuals)
algeria_fitted <- broom::augment(algeria_model)
# Compare the predicted values with the actual values of life expectancy
algeria_fitted %>%
ggplot(aes(x = year)) +
geom_point(aes(y = lifeExp)) +
geom_line(aes(y = .fitted), color = "red")
Chapter 2 - Multiple Models with broom
Exploring coefficients across models:
Evaluating fit of many models:
Visually inspect the fit of many models:
Improve the fit of your models:
Example code includes:
# Extract the coefficient statistics of each model into nested dataframes
model_coef_nested <- gap_models %>%
mutate(coef = map(.x=model, .f=~broom::tidy(.x)))
# Simplify the coef dataframes for each model
model_coef <- model_coef_nested %>%
unnest(coef)
# Plot a histogram of the coefficient estimates for year
model_coef %>%
filter(term=="year") %>%
ggplot(aes(x = estimate)) +
geom_histogram()
# Extract the fit statistics of each model into dataframes
model_perf_nested <- gap_models %>%
mutate(fit = map(.x=model, .f=~broom::glance(.x)))
# Simplify the fit dataframes for each model
model_perf <- model_perf_nested %>%
unnest(fit)
# Look at the first six rows of model_perf
head(model_perf)
# Plot a histogram of rsquared for the 77 models
model_perf %>%
ggplot(aes(x=r.squared)) +
geom_histogram()
# Extract the 4 best fitting models (highest r.squared)
best_fit <- model_perf %>%
top_n(n = 4, wt = r.squared)
# Extract the 4 models with the worst fit (negated weight selects the lowest)
worst_fit <- model_perf %>%
top_n(n = 4, wt = -r.squared)
best_augmented <- best_fit %>%
# Build the augmented dataframe for each country model
mutate(augmented = map(.x=model, .f=~broom::augment(.x))) %>%
# Expand the augmented dataframes
unnest(augmented)
worst_augmented <- worst_fit %>%
# Build the augmented dataframe for each country model
mutate(augmented = map(.x=model, .f=~broom::augment(.x))) %>%
# Expand the augmented dataframes
unnest(augmented)
# Compare the predicted values with the actual values of life expectancy
# for the top 4 best fitting models
best_augmented %>%
ggplot(aes(x=year)) +
geom_point(aes(y=lifeExp)) +
geom_line(aes(y=.fitted), color = "red") +
facet_wrap(~country, scales = "free_y")
# Compare the predicted values with the actual values of life expectancy
# for the top 4 worst fitting models
worst_augmented %>%
ggplot(aes(x=year)) +
geom_point(aes(y=lifeExp)) +
geom_line(aes(y=.fitted), color = "red") +
facet_wrap(~country, scales = "free_y")
# Build a linear model for each country using all features
gap_fullmodel <- gap_nested %>%
mutate(model = map(data, ~lm(formula = lifeExp ~ year + pop + gdpPercap, data = .x)))
fullmodel_perf <- gap_fullmodel %>%
# Extract the fit statistics of each model into dataframes
mutate(fit = map(model, ~broom::glance(.x))) %>%
# Simplify the fit dataframes for each model
unnest(fit)
# View the performance for the four countries with the worst fitting
# four simple models you looked at before
fullmodel_perf %>%
filter(country %in% worst_fit$country) %>%
select(country, adj.r.squared)
Chapter 3 - Build, Tune, and Evaluate Regression Models
Training, test, and validation splits:
Measuring cross-validation performance:
Building and tuning a random-forest model:
Measuring the test performance:
Example code includes:
# Seed for a reproducible train/test split
set.seed(42)
# Prepare the initial split object (75% train / 25% test)
gap_split <- rsample::initial_split(gapminder, prop = 0.75)
# Extract the training dataframe
training_data <- rsample::training(gap_split)
# Extract the testing dataframe
testing_data <- rsample::testing(gap_split)
# Calculate the dimensions of both training_data and testing_data
dim(training_data)
dim(testing_data)
set.seed(42)
# Prepare the dataframe containing the 5-fold cross validation partitions
cv_split <- rsample::vfold_cv(training_data, v = 5)
cv_data <- cv_split %>%
mutate(
# Extract the train dataframe for each split
train = map(splits, ~rsample::training(.x)),
# Extract the validate dataframe for each split
validate = map(splits, ~rsample::testing(.x))
)
# Use head() to preview cv_data
head(cv_data)
# Build a model using the train data for each fold of the cross validation
cv_models_lm <- cv_data %>%
mutate(model = map(train, ~lm(formula = lifeExp ~ ., data = .x)))
cv_prep_lm <- cv_models_lm %>%
mutate(
# Extract the recorded life expectancy for the records in the validate dataframes
validate_actual = map(.x=validate, .f=~.x$lifeExp),
# Predict life expectancy for each validate set using its corresponding model
validate_predicted = map2(.x=model, .y=validate, .f=~predict(.x, .y))
)
library(Metrics)
# Calculate the mean absolute error for each validate fold
cv_eval_lm <- cv_prep_lm %>%
mutate(validate_mae = map2_dbl(.x=validate_actual, .y=validate_predicted,
.f=~mae(actual = .x, predicted = .y)
)
)
# Print the validate_mae column
cv_eval_lm$validate_mae
# Calculate the mean of validate_mae column
mean(cv_eval_lm$validate_mae)
library(ranger)
# Build a random forest model for each fold (seed fixed for reproducibility)
cv_models_rf <- cv_data %>%
mutate(model = map(train, ~ranger(formula = lifeExp ~ ., data = .x,
num.trees = 100, seed = 42)))
# Generate predictions using the random forest model
# (ranger's predict returns a list; $predictions holds the numeric vector)
cv_prep_rf <- cv_models_rf %>%
mutate(validate_predicted = map2(.x=model, .y=validate, .f=~predict(.x, .y)$predictions))
# Calculate validate MAE for each fold
cv_eval_rf <- cv_prep_rf %>%
mutate(validate_actual=map(.x=validate, .f=~.x$lifeExp),
validate_mae = map2_dbl(.x=validate_actual, .y=validate_predicted,
.f=~mae(actual = .x, predicted = .y)
)
)
# Print the validate_mae column
cv_eval_rf$validate_mae
# Calculate the mean of validate_mae column
mean(cv_eval_rf$validate_mae)
# Prepare for tuning your cross validation folds by varying mtry:
# crossing() duplicates every fold for each candidate mtry value
cv_tune <- cv_data %>%
tidyr::crossing(mtry = 2:5)
# Build a model for each fold & mtry combination
cv_model_tunerf <- cv_tune %>%
mutate(model = map2(.x=train, .y=mtry, ~ranger(formula = lifeExp ~ .,
data = .x, mtry = .y,
num.trees = 100, seed = 42)))
# Generate validate predictions for each model
cv_prep_tunerf <- cv_model_tunerf %>%
mutate(validate_predicted = map2(.x=model, .y=validate, .f=~predict(.x, .y)$predictions))
# Calculate validate MAE for each fold and mtry combination
cv_eval_tunerf <- cv_prep_tunerf %>%
mutate(validate_actual=map(.x=validate, .f=~.x$lifeExp),
validate_mae = map2_dbl(.x=validate_actual, .y=validate_predicted,
.f=~mae(actual = .x, predicted = .y)
)
)
# Calculate the mean validate_mae for each mtry used
cv_eval_tunerf %>%
group_by(mtry) %>%
summarise(mean_mae = mean(validate_mae))
# Build the model using all training data and the best performing parameter
best_model <- ranger(formula = lifeExp ~ ., data = training_data,
mtry = 4, num.trees = 100, seed = 42)
# Prepare the test_actual vector
test_actual <- testing_data$lifeExp
# Predict life_expectancy for the testing_data
test_predicted <- predict(best_model, testing_data)$predictions
# Calculate the test MAE
mae(test_actual, test_predicted)
Chapter 4 - Build, Tune, and Evaluate Classification Models
Logistic Regression Models:
Evaluating Classification Models:
Random Forest for Classification:
Wrap Up:
Example code includes:
# Load the employee attrition data (binary Attrition outcome)
attrition <- readRDS("./RInputFiles/attrition.rds")
str(attrition)
head(attrition)
set.seed(42)
# Prepare the initial split object (75% train / 25% test)
data_split <- rsample::initial_split(data=attrition, prop=0.75)
# Extract the training dataframe
training_data <- rsample::training(data_split)
# Extract the testing dataframe
testing_data <- rsample::testing(data_split)
set.seed(42)
# 5-fold cross validation partitions of the training data
cv_split <- rsample::vfold_cv(training_data, v=5)
cv_data <- cv_split %>%
mutate(
# Extract the train dataframe for each split
train = map(.x=splits, .f=~rsample::training(.x)),
# Extract the validate dataframe for each split
validate = map(.x=splits, .f=~rsample::testing(.x))
)
# Build a logistic regression model using the train data for each fold
cv_models_lr <- cv_data %>%
mutate(model = map(train, ~glm(formula = Attrition ~ ., data = .x, family = "binomial")))
# Extract the first model and validate set to walk through the metrics once
model <- cv_models_lr$model[[1]]
validate <- cv_models_lr$validate[[1]]
# Prepare binary vector of actual Attrition values in validate
validate_actual <- validate$Attrition == "Yes"
# Predict the probabilities for the observations in validate
validate_prob <- predict(model, validate, type = "response")
# Prepare binary vector of predicted Attrition values for validate
# (0.5 probability threshold)
validate_predicted <- validate_prob > 0.5
library(Metrics)
# Compare the actual & predicted performance visually using a table
table(validate_actual, validate_predicted)
# Calculate the accuracy
accuracy(validate_actual, validate_predicted)
# Calculate the precision
precision(validate_actual, validate_predicted)
# Calculate the recall
recall(validate_actual, validate_predicted)
# Repeat the actual/predicted preparation across every fold
cv_prep_lr <- cv_models_lr %>%
mutate(
# Prepare binary vector of actual Attrition values in validate
validate_actual = map(.x=validate, ~.x$Attrition == "Yes"),
# Prepare binary vector of predicted Attrition values for validate
validate_predicted = map2(.x=model, .y=validate, .f=~predict(.x, .y, type = "response") > 0.5)
)
# Calculate the validate recall for each cross validation fold
cv_perf_recall <- cv_prep_lr %>%
mutate(validate_recall = map2_dbl(.x=validate_actual, .y=validate_predicted, .f=~recall(actual = .x, predicted = .y)))
# Print the validate_recall column
cv_perf_recall$validate_recall
# Calculate the average of the validate_recall column
mean(cv_perf_recall$validate_recall)
library(ranger)
# Prepare for tuning your cross validation folds by varying mtry
cv_tune <- cv_data %>%
crossing(mtry = c(2, 4, 8, 16))
# Build a cross validation model for each fold & mtry combination
cv_models_rf <- cv_tune %>%
mutate(model = map2(train, mtry, ~ranger(formula = Attrition~.,
data = .x, mtry = .y,
num.trees = 100, seed = 42)))
cv_prep_rf <- cv_models_rf %>%
mutate(
# Prepare binary vector of actual Attrition values in validate
validate_actual = map(validate, ~.x$Attrition == "Yes"),
# Prepare binary vector of predicted Attrition values for validate
# (ranger's predict returns class labels in $predictions)
validate_predicted = map2(.x=model, .y=validate, ~predict(.x, .y, type = "response")$predictions=="Yes")
)
# Calculate the validate recall for each cross validation fold
cv_perf_recall <- cv_prep_rf %>%
mutate(recall = map2_dbl(.x=validate_actual, .y=validate_predicted, ~recall(actual=.x, predicted=.y)))
# Calculate the mean recall for each mtry used
cv_perf_recall %>%
group_by(mtry) %>%
summarise(mean_recall = mean(recall))
# Build the logistic regression model using all training data
best_model <- glm(formula = Attrition ~ .,
data = training_data, family = "binomial")
# Prepare binary vector of actual Attrition values for testing_data
test_actual <- testing_data$Attrition == "Yes"
# Prepare binary vector of predicted Attrition values for testing_data
test_predicted <- predict(best_model, newdata=testing_data, type = "response") > 0.5
# Compare the actual & predicted performance visually using a table
table(test_actual, test_predicted)
# Calculate the test accuracy
accuracy(test_actual, test_predicted)
# Calculate the test precision
precision(test_actual, test_predicted)
# Calculate the test recall
recall(test_actual, test_predicted)
Chapter 1 - Introduction, Networks, and Labeled Networks
Introduction:
Labeled Networks, Social Influence:
Challenges:
Example code includes:
library(igraph)
# Load the student call-graph edge list
load("./RInputFiles/StudentEdgelist.RData")
# Inspect edgeList
str(edgeList)
head(edgeList)
# Construct the igraph object (undirected)
network <- graph_from_data_frame(edgeList, directed = FALSE)
# View your igraph object
network
load("./RInputFiles/StudentCustomers.RData")
# Inspect the customers dataframe
str(customers)
head(customers)
# Count the number of churners and non-churners
table(customers$churn)
# Add a node attribute called churn
# NOTE(review): assumes the rows of customers are in the same order as the
# vertices of network — confirm before relying on per-vertex labels.
V(network)$churn <- customers$churn
# useVerts <- c('1', '10', '100', '1000', '101', '102', '103', '104', '105', '106', '107', '109', '11', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '12', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '13', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '14', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149', '15', '150', '152', '153', '154', '155', '156', '157', '158', '159', '16', '160', '161', '162', '163', '164', '165', '166', '167', '168', '169', '17', '170', '171', '172', '173', '174', '175', '176', '177', '178', '179', '18', '180', '181', '182', '183', '184', '185', '186', '187', '188', '189', '19', '190', '191', '192', '193', '194', '195', '196', '197', '198', '199', '2', '20', '200', '201', '202', '204', '205', '206', '207', '208', '209', '21', '210', '211', '212', '213', '214', '215', '216', '217', '218', '219', '22', '220', '221', '222', '223', '224', '225', '227', '228', '229', '23', '230', '231', '232', '233', '234', '235', '236', '237', '238', '239', '24', '240', '241', '242', '243', '244', '245', '246', '247', '248', '249', '25', '250', '251', '252', '253', '254', '255', '256', '257', '258', '259', '26', '260', '261', '262', '263', '264', '265', '266', '267', '269', '27', '270', '271', '272', '273', '274', '276', '277', '278', '279', '28', '280', '281', '282', '283', '284', '285', '286', '287', '288', '289', '29', '290', '291', '292', '293', '294', '296', '297', '299', '3', '300', '301', '302', '303', '304', '305', '306', '307', '308', '309', '310', '311', '312', '313', '314', '315', '316', '317', '318', '319', '32', '320', '321', '322', '323', '324', '325', '326', '327', '328', '329', '330', '331', '332', '333', '334', '335', '336', '337', '338', '34', '340', '341', '342', '343', '344', '345', '346', '347', '348', '349', '35', '351', '352', '353', '354', '356', '357', '358', '359', '360', '361', '362', '363', '364', '365', '366', '367', '368', '369', '37', 
'370', '371', '372', '373', '374', '375', '376', '377', '378', '379', '38', '380', '381', '382', '383', '384', '385', '386', '388', '39', '390', '391', '392', '393', '394', '395', '396', '397', '398', '399', '4', '40', '400', '401', '402', '403', '404', '405', '406', '407', '408', '409', '41', '410', '411', '412', '413', '414', '415', '416', '417', '419', '42', '420', '422', '423', '424', '425', '426', '427', '428', '429', '43', '430', '431', '433', '434', '435', '436', '437', '438', '439', '44', '440', '441', '442', '443', '444', '445', '446', '447', '448', '449', '45', '450', '451', '452', '453', '454', '455', '456', '457', '458', '459', '460', '461', '462', '463', '464', '465', '466', '467', '468', '469', '47', '470', '471', '472', '473', '474', '475', '476', '477', '478', '479', '48', '480', '481', '482', '483', '484', '485', '486', '487', '488', '489', '49', '490', '491', '492', '493', '494', '495', '496', '497', '498', '499')
# useVerts <- c(useVerts, '5', '50', '500', '501', '502', '503', '504', '505', '506', '507', '508', '509', '51', '510', '511', '513', '514', '515', '516', '517', '518', '519', '52', '520', '521', '522', '523', '524', '525', '526', '527', '528', '529', '53', '530', '531', '532', '533', '534', '535', '536', '537', '538', '539', '54', '540', '541', '543', '544', '545', '546', '547', '548', '549', '55', '550', '551', '552', '553', '554', '555', '556', '557', '558', '559', '56', '560', '561', '562', '563', '564', '565', '566', '567', '568', '569', '57', '570', '571', '573', '575', '576', '577', '578', '579', '58', '580', '581', '582', '583', '584', '585', '587', '588', '589', '59', '590', '591', '592', '593', '594', '595', '596', '597', '598', '599', '6', '60', '600', '601', '602', '603', '604', '605', '606', '607', '608', '609', '61', '610', '611', '612', '613', '614', '615', '617', '618', '619', '62', '620', '621', '622', '623', '624', '625', '626', '627', '628', '629', '63', '630', '631', '632', '633', '634', '635', '636', '637', '638', '64', '640', '641', '642', '643', '644', '645', '646', '647', '648', '649', '65', '650', '651', '652', '653', '654', '655', '656', '657', '658', '659', '66', '660', '661', '662', '663', '664', '665', '666', '667', '668', '669', '67', '670', '671', '672', '673', '674', '675', '676', '677', '678', '679', '68', '680', '681', '682', '683', '684', '685', '686', '687', '688', '689', '69', '690', '691', '692', '694', '695', '696', '697', '698', '699', '7', '700', '701', '702', '703', '704', '705', '706', '707', '708', '709', '71', '711', '713', '714', '715', '716', '717', '718', '719', '72', '720', '721', '722', '723', '724', '725', '726', '728', '729', '73', '730', '731', '732', '733', '734', '735', '736', '737', '738', '739', '74', '740', '741', '742', '743', '744', '745', '746', '748', '749', '75', '751', '752', '753', '754', '755', '756', '757', '758', '759', '76', '760', '761', '762', '763', '764', '765', '766', '767', '768', '769', 
'77', '770', '771', '772', '773', '774', '775', '776', '777', '778', '779', '78', '780', '781', '782', '783', '784', '785', '786', '787', '788', '789', '79', '790', '791', '792', '793', '794', '795', '796', '797', '798', '799', '8', '80', '800', '801', '802', '803', '804', '805', '806', '807', '808', '809', '81', '810', '811', '812', '813', '814', '816', '817', '818', '819', '82', '820', '821', '822', '823', '824', '825', '826', '827', '828', '829', '83', '830', '831', '832', '833', '834', '835', '836', '837', '838', '839', '84', '840', '842', '843', '844', '845', '846', '847', '849', '85', '850', '851', '852', '853', '854', '855', '856', '857', '858', '859', '86', '860', '861', '862', '863', '864', '865', '866', '867', '868', '869', '87', '870', '871', '872', '873', '874', '875', '876', '877', '878', '879', '88', '880', '881', '882', '883', '884', '885', '886', '887', '888', '889', '89', '890', '891', '892', '894', '895', '896', '897', '898', '90', '901', '902', '903', '904', '905', '906', '907', '908', '909', '91', '910', '911', '912', '913', '914', '915', '916', '917', '918', '919', '92', '920', '921', '922', '923', '924', '925', '926', '927', '928', '929', '93', '930', '931', '932', '933', '934', '935', '936', '937', '938', '939', '94', '940', '941', '942', '943', '944', '945', '946', '947', '948', '949', '95', '950', '951', '952', '953', '954', '955', '956', '957', '958', '959', '96', '960', '961', '963', '964', '965', '966', '967', '968', '969', '97', '970', '971', '972', '973', '974', '975', '976', '977', '979', '98', '980', '981', '982', '983', '984', '985', '986', '987', '988', '989', '99', '990', '991', '992', '993', '994', '995', '996', '997', '998', '999')
# useVertNums <- match(useVerts, V(network))
# useNetwork <- induced_subgraph(network, useVertNums)
# Work on the full network (the commented-out subgraph selection above is unused)
useNetwork <- network
useNetwork
# Visualize the network (pretty messy)
# plot(useNetwork, vertex.label = NA, edge.label = NA, edge.color = 'black', vertex.size = 2)
# Add a node attribute called color, seeded from the 0/1 churn labels
V(useNetwork)$color <- V(useNetwork)$churn
# Change the color of churners to red and non-churners to white
V(useNetwork)$color <- gsub("1", "red", V(useNetwork)$color)
V(useNetwork)$color <- gsub("0", "white", V(useNetwork)$color)
# Plot the network (pretty messy)
# plot(useNetwork, vertex.label = NA, edge.label = NA, edge.color = 'black', vertex.size = 2)
# Create a subgraph with only churners
churnerNetwork <- induced_subgraph(useNetwork, v = V(useNetwork)[which(V(useNetwork)$churn == 1)])
# Plot the churner network
plot(churnerNetwork, vertex.label = NA, vertex.size = 2)
# Tally the churn labels among the neighbors of vertex v in useNetwork.
# Returns a length-2 vector: c(non-churn neighbor count, churn neighbor count).
ctNeighbors <- function(v) {
  neighbor_churn <- V(useNetwork)[neighbors(useNetwork, v, mode = "all")]$churn
  c(sum(neighbor_churn == 0), sum(neighbor_churn == 1))
}
# Apply ctNeighbors to every vertex; with a length-2 result per vertex this
# yields a 2 x n matrix (row 1: non-churn counts, row 2: churn counts).
mtxNeighbors <- sapply(V(useNetwork), FUN=ctNeighbors)
NonChurnNeighbors <- mtxNeighbors[1, ]
ChurnNeighbors <- mtxNeighbors[2, ]
# Compute the churn probabilities (fraction of neighbors who churned).
# NOTE(review): a vertex with zero neighbors yields 0/0 = NaN here — confirm
# the network has no isolated vertices.
churnProb <- ChurnNeighbors / (ChurnNeighbors + NonChurnNeighbors)
# Find who is most likely to churn
mostLikelyChurners <- which(churnProb == max(churnProb))
# Extract the IDs of the most likely churners
customers$id[mostLikelyChurners]
# Find churn probability of the 44th customer
churnProb[44]
# Update the churn probabilities and the non-churn probabilities:
# each vertex takes the average churn probability of its neighbors.
AdjacencyMatrix <- as_adjacency_matrix(useNetwork)
nNeighbors <- colSums(mtxNeighbors)
churnProb_updated <- as.vector((AdjacencyMatrix %*% churnProb) / nNeighbors)
# Find updated churn probability of the 44th customer
churnProb_updated[44]
# Compute the AUC of the neighbor-based scores against the true labels
pROC::auc(customers$churn, as.vector(churnProb))
# Write a for loop to update the probabilities (10 propagation iterations)
for(i in 1:10){
churnProb <- as.vector((AdjacencyMatrix %*% churnProb) / nNeighbors)
}
# Compute the AUC again
pROC::auc(customers$churn, as.vector(churnProb))
Chapter 2 - Homophily
Homophily:
Dyadicity:
Heterophilicity:
Summary:
Example code includes:
# Add the column edgeList$FromLabel: look up each edge's source customer and
# take column 2 of customers (the churn label, per its use in edgeType below).
edgeList$FromLabel <- customers[match(edgeList$from, customers$id), 2]
# Add the column edgeList$ToLabel
edgeList$ToLabel <- customers[match(edgeList$to, customers$id), 2]
# Add the column edgeList$edgeType: 0 = both non-churn, 1 = mixed, 2 = both churn
edgeList$edgeType <- edgeList$FromLabel + edgeList$ToLabel
# Count the number of each type of edge
table(edgeList$edgeType)
# Count churn edges
ChurnEdges <- sum(edgeList$edgeType == 2)
# Count non-churn edges
NonChurnEdges <- sum(edgeList$edgeType == 0)
# Count mixed edges
MixedEdges <- sum(edgeList$edgeType == 1)
# Count all edges
edges <- ChurnEdges + NonChurnEdges + MixedEdges
# Print the number of edges
edges
# Count the number of churn nodes
ChurnNodes <- sum(customers$churn == 1)
# Count the number of non-churn nodes
NonChurnNodes <- sum(customers$churn == 0)
# Count the total number of nodes
nodes <- ChurnNodes + NonChurnNodes
# Compute the network connectance: observed edges over the n*(n-1)/2 possible
connectance <- 2 * edges / nodes / (nodes - 1)
# Print the value
connectance
# Compute the expected number of churn-churn edges under random mixing
ExpectedDyadChurn <- ChurnNodes * (ChurnNodes - 1) * connectance / 2
# Compute the churn dyadicity (observed / expected)
DyadChurn <- ChurnEdges / ExpectedDyadChurn
# Inspect the value
DyadChurn
# Compute the expected number of mixed edges under random mixing
ExpectedHet <- NonChurnNodes * ChurnNodes * connectance
# Compute the heterophilicity (observed mixed edges / expected)
Het <- MixedEdges / ExpectedHet
# Inspect the heterophilicity
Het
Chapter 3 - Network Featurization